# In [1]:
# Step 1: Install & Import Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Step 2: Load Dataset
# Point `file_path` at the CSV in your own environment before running.
file_path = "/content/CarPrice_Assignment_.csv"  # Update this path
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Re-raise the same exception type with an actionable hint, since a wrong
    # path is the most likely reason this script stops here.
    raise FileNotFoundError(
        f"Dataset not found at '{file_path}'. Upload the CSV or update "
        "`file_path` so that it points at the file's actual location."
    ) from err

# Step 3: Display Basic Information About the Dataset
print("Dataset Overview:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()  # Column names, data types, non-null counts
print("\nFirst 5 Rows of the Dataset:\n")
print(df.head())

# Step 4: Handling Missing Values
print("\nChecking for Missing Values:\n")
print(df.isnull().sum())  # Count missing values per column

# Strategy to handle missing values:
# 1. Fill categorical (object / string / category) columns with their mode
# 2. Fill every other column with its median
for col in df.columns:
    column = df[col]
    is_categorical = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
        or isinstance(column.dtype, pd.CategoricalDtype)
    )
    if is_categorical:
        modes = column.mode()
        if modes.empty:
            # Every value is missing, so there is no mode to fill with;
            # leave the column untouched instead of crashing on modes[0].
            continue
        fill_value = modes.iloc[0]
    else:
        fill_value = column.median()
    df[col] = column.fillna(fill_value)  # Assign back to the column

print("\nMissing Values After Handling:\n")
print(df.isnull().sum())

# Step 5: Basic Statistics
print("\nBasic Statistics of Numerical Features:\n")
print(df.describe())  # Count, mean, std, min, quartiles (50% = median), max

# Step 6: Identifying Outliers
# Select the numeric columns once and reuse them for the boxplot and z-scores
numeric_features = df.select_dtypes(include=['float64', 'int64'])

plt.figure(figsize=(12, 6))
sns.boxplot(data=numeric_features)
plt.xticks(rotation=90)
plt.title("Boxplot for Outlier Detection")
plt.show()

# Z-score method to detect outliers
from scipy.stats import zscore

outlier_threshold = 3
# zscore() may hand back an unlabelled array, so re-attach the row/column
# labels; otherwise the per-column counts print without column names.
z_scores = pd.DataFrame(
    np.abs(zscore(numeric_features)),
    index=numeric_features.index,
    columns=numeric_features.columns,
)
# Count, per column, the rows whose |z| exceeds the threshold
outliers = (z_scores > outlier_threshold).sum(axis=0)
print("\nNumber of Outliers Per Column:\n")
print(outliers)


# Step 7: Key Analysis

# 1. Examining Distributions
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
n_cols = len(numerical_columns)  # Number of numeric features to plot
plots_per_row = 3
n_rows = -(-n_cols // plots_per_row)  # Ceiling division: grid rows required

if n_cols:
    # Grow the figure with the number of rows so each histogram keeps a
    # readable height (two rows reproduce the previous 12x6 canvas).
    plt.figure(figsize=(12, 3 * n_rows))
    for i, col in enumerate(numerical_columns, 1):
        plt.subplot(n_rows, plots_per_row, i)
        sns.histplot(df[col], kde=True, bins=30)
        plt.title(f'Distribution of {col}')
    plt.tight_layout()
    plt.show()
else:
    # Nothing to draw; avoid creating an empty figure
    print("No numerical columns available to plot.")

# 2. Examining Correlations (Only numeric columns)
# Keep just the int64/float64 columns, then correlate them pairwise
numerical_df = df.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numerical_df.corr()

# Annotated heatmap of the correlation matrix
corr_heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, **corr_heatmap_options)
plt.title("Correlation Matrix of Numerical Features")
plt.show()

# 3. Identifying Potential Key Features for Prediction
# Rank the numeric features by their correlation with the target variable and
# plot the most strongly related ones against it.

target = 'price ($)'  # Replace with the actual target column name if it's different
if target in numerical_df.columns:
    correlation_with_target = correlation_matrix[target].sort_values(ascending=False)
    print(f"\nCorrelation with {target}:\n")
    print(correlation_with_target)

    # Choose the features with the largest |correlation|: a strong negative
    # relationship is as informative as a strong positive one. The target is
    # removed by name rather than by assuming it sorts into first place, and
    # undefined (NaN) correlations are ignored.
    top_features = (
        correlation_with_target.drop(labels=target)
        .dropna()
        .abs()
        .sort_values(ascending=False)
        .index[:3]
    )

    # Visualizing the relationship between key features and the target variable
    plt.figure(figsize=(12, 6))
    for i, col in enumerate(top_features, 1):  # Up to 3 strongest features
        plt.subplot(1, 3, i)
        sns.scatterplot(x=df[col], y=df[target])
        plt.title(f'{col} vs {target}')
    plt.tight_layout()
    plt.show()
else:
    print(f"Target variable '{target}' not found in numeric columns.")


# Step 8: Key Metrics Analysis

# 1. Correlation Coefficients
# Pairwise correlations are computed over the int64/float64 columns only
numerical_df = df.select_dtypes(include=['float64', 'int64'])
correlation_coefficients = numerical_df.corr()

print("\nCorrelation Coefficients:\n")
print(correlation_coefficients)

# Heatmap of the correlation coefficients, annotated to two decimals
plt.figure(figsize=(9, 6))
sns.heatmap(
    correlation_coefficients,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
)
plt.title("Correlation Matrix of Numerical Features")
plt.show()

# 2. Variance & Standard Deviation
# Per-feature spread of the numerical columns
variance = numerical_df.var()
std_deviation = numerical_df.std()

for heading, stat in (
    ("\nVariance of Numerical Features:\n", variance),
    ("\nStandard Deviation of Numerical Features:\n", std_deviation),
):
    print(heading)
    print(stat)

# Side-by-side bar charts: variance on the left, standard deviation on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
spread_panels = (
    (axes[0], variance, "Variance of Numerical Features"),
    (axes[1], std_deviation, "Standard Deviation of Numerical Features"),
)
for ax, stat, panel_title in spread_panels:
    sns.barplot(x=stat.index, y=stat.values, ax=ax)
    ax.set_title(panel_title)
    ax.set_xticks(range(len(stat)))  # Pin tick positions before relabelling
    ax.set_xticklabels(stat.index, rotation=90)

plt.tight_layout()
plt.show()

# 3. Covariance Analysis
# The covariance matrix describes how pairs of numerical features vary together
covariance_matrix = numerical_df.cov()

print("\nCovariance Matrix:\n")
print(covariance_matrix)

# Heatmap of the covariance matrix; fmt='.2f' keeps annotations to 2 decimals
cov_heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'fmt': '.2f', 'linewidths': 0.5}
plt.figure(figsize=(10, 6))
sns.heatmap(covariance_matrix, **cov_heatmap_options)
plt.title("Covariance Matrix of Numerical Features")
plt.show()

# Step 9
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# Splitting data into train and test sets
feature_columns = ['enginesize (cc)', 'horsepower (hp)', 'curbweight (lbs)']
X = df[feature_columns]  # Predictors: the 3 selected features
y = df['price ($)']  # Target variable

# Hold out 20% for testing; the fixed seed keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simple Linear Regression (engine size as the only predictor)
engine_size_train = X_train[['enginesize (cc)']]
engine_size_test = X_test[['enginesize (cc)']]
simple_model = LinearRegression().fit(engine_size_train, y_train)
y_pred_simple = simple_model.predict(engine_size_test)

# Multiple Linear Regression (all 3 selected features)
multiple_model = LinearRegression().fit(X_train, y_train)
y_pred_multiple = multiple_model.predict(X_test)

# Calculating RMSE, MSE, and R² for both models
def evaluate_model(y_test, y_pred):
    """Score a set of predictions against the true target values.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        A ``(rmse, mse, r2)`` tuple: root mean squared error, mean squared
        error and the R² score.
    """
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse), mse, r2_score(y_test, y_pred)

# Score both linear models with the shared helper
rmse_simple, mse_simple, r2_simple = evaluate_model(y_test, y_pred_simple)
rmse_multiple, mse_multiple, r2_multiple = evaluate_model(y_test, y_pred_multiple)

# Printing the results (simple model first, then the 3-feature model)
for metric_label, metric_value in (
    ("Simple Linear Regression - RMSE:", rmse_simple),
    ("Simple Linear Regression - MSE:", mse_simple),
    ("Simple Linear Regression - R^2:", r2_simple),
    ("Multiple Linear Regression (3 features) - RMSE:", rmse_multiple),
    ("Multiple Linear Regression (3 features) - MSE:", mse_multiple),
    ("Multiple Linear Regression (3 features) - R^2:", r2_multiple),
):
    print(metric_label, metric_value)

# Plotting Simple Linear Regression: actual prices and the fitted line
test_engine_sizes = X_test['enginesize (cc)']
plt.figure(figsize=(10, 6))
plt.scatter(test_engine_sizes, y_test, color='blue', label='Actual Price', alpha=0.5)
plt.plot(test_engine_sizes, y_pred_simple, color='red', label='Predicted Price (Simple Model)')
plt.xlabel('Engine Size (cc)')
plt.ylabel('Price ($)')
plt.title('Simple Linear Regression: Price vs Engine Size')
plt.legend()
plt.show()

# Plotting Multiple Linear Regression: predicted vs actual, with the y = x
# diagonal marking where a perfect prediction would fall
price_bounds = [y_test.min(), y_test.max()]
plt.figure(figsize=(8, 4))
plt.scatter(y_test, y_pred_multiple, color='green', label='Predicted vs Actual')
plt.plot(price_bounds, price_bounds, color='red', lw=2, label='Perfect Prediction')
plt.xlabel('Actual Price ($)')
plt.ylabel('Predicted Price ($)')
plt.title('Multiple Linear Regression: Predicted vs Actual Price')
plt.legend()
plt.show()


from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Splitting data into train and test sets
X = df[['enginesize (cc)', 'horsepower (hp)', 'curbweight (lbs)']]  # Using only 3 features
y = df['price ($)']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Polynomial Regression (degree 2, i.e. a quadratic in engine size)
poly = PolynomialFeatures(degree=2)

# Engine size as plain NumPy columns (.values drops the feature names), then
# expanded into polynomial terms; the expansion is fitted on the training set
engine_train_values = X_train[['enginesize (cc)']].values
engine_test_values = X_test[['enginesize (cc)']].values
X_train_poly = poly.fit_transform(engine_train_values)
X_test_poly = poly.transform(engine_test_values)

# Fit a linear model on the polynomial terms and predict the held-out prices
poly_model = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = poly_model.predict(X_test_poly)

# Evaluate the model
mse_poly = mean_squared_error(y_test, y_pred_poly)  # Mean Squared Error
rmse_poly = np.sqrt(mse_poly)  # Root Mean Squared Error
r2_poly = r2_score(y_test, y_pred_poly)  # R²

# Print the evaluation metrics
for metric_label, metric_value in (
    ("Polynomial Regression - RMSE:", rmse_poly),
    ("Polynomial Regression - MSE:", mse_poly),
    ("Polynomial Regression - R^2:", r2_poly),
):
    print(metric_label, metric_value)

# Visualize the Polynomial Regression
# 100 evenly spaced engine sizes across the training range give a smooth curve
train_engine_sizes = X_train['enginesize (cc)']
x_range = np.linspace(train_engine_sizes.min(), train_engine_sizes.max(), 100).reshape(-1, 1)
x_range_poly = poly.transform(x_range)  # Same polynomial expansion as the model inputs
y_range_pred = poly_model.predict(x_range_poly)

plt.figure(figsize=(10, 6))
# Training observations
plt.scatter(train_engine_sizes, y_train, color='blue', label='Training Data')
# Fitted quadratic curve
plt.plot(x_range, y_range_pred, color='red', label='Polynomial Regression (Degree 2)', linewidth=2)
# Model predictions for the held-out test rows
plt.scatter(X_test['enginesize (cc)'], y_pred_poly, color='green', label='Test Data Predictions')

# Labels and title
plt.title('Polynomial Regression - Price vs Engine Size')
plt.xlabel('Engine Size (cc)')
plt.ylabel('Price ($)')
plt.legend()

# Show the plot
plt.show()

from sklearn.tree import DecisionTreeRegressor 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import plot_tree

# Splitting data into train and test sets
X = df[['enginesize (cc)', 'horsepower (hp)', 'curbweight (lbs)']]  # Using only 3 features
y = df['price ($)']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree Regression (fixed seed for reproducible fits)
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)

# Predict using the decision tree model
y_pred_tree = tree_model.predict(X_test)

# Evaluate the model
mse_tree = mean_squared_error(y_test, y_pred_tree)  # Mean Squared Error
rmse_tree = np.sqrt(mse_tree)  # Root Mean Squared Error
r2_tree = r2_score(y_test, y_pred_tree)  # R²

# Print the evaluation metrics
print("Decision Tree Regression - RMSE:", rmse_tree)
print("Decision Tree Regression - MSE:", mse_tree)
print("Decision Tree Regression - R^2:", r2_tree)

# Plotting predicted vs actual values; the dashed y = x diagonal marks where a
# perfect prediction would fall
actual_bounds = [y_test.min(), y_test.max()]
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_tree, color='blue', label='Predicted vs Actual')
plt.plot(actual_bounds, actual_bounds, color='red', linestyle='--', label='Ideal fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Decision Tree Regression: Actual vs Predicted')
plt.legend()
plt.show()

# Visualizing the Decision Tree
plt.figure(figsize=(10, 6))
# Pass the feature names as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
plot_tree(tree_model, filled=True, feature_names=list(X.columns), rounded=True, fontsize=10)
plt.title('Decision Tree Model Visualization')
plt.show()



# Splitting data into train and test sets
X = df[['enginesize (cc)', 'horsepower (hp)', 'curbweight (lbs)']]  # Using 3 features
y = df['price ($)']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _score_predictions(y_true, y_pred):
    """Return one results row (RMSE, MSE, R²) for a model's predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return {"RMSE": np.sqrt(mse), "MSE": mse, "R²": r2_score(y_true, y_pred)}


# Dictionary to store model results, keyed by model name
results = {}

### SIMPLE LINEAR REGRESSION ###
simple_model = LinearRegression()
simple_model.fit(X_train[['enginesize (cc)']], y_train)
y_pred_simple = simple_model.predict(X_test[['enginesize (cc)']])
results['Simple Linear Regression'] = _score_predictions(y_test, y_pred_simple)

### MULTIPLE LINEAR REGRESSION ###
multiple_model = LinearRegression()
multiple_model.fit(X_train, y_train)
y_pred_multiple = multiple_model.predict(X_test)
results['Multiple Linear Regression'] = _score_predictions(y_test, y_pred_multiple)

### POLYNOMIAL REGRESSION (Degree 2) ###
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train[['enginesize (cc)']])
X_test_poly = poly.transform(X_test[['enginesize (cc)']])

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)
y_pred_poly = poly_model.predict(X_test_poly)
results['Polynomial Regression'] = _score_predictions(y_test, y_pred_poly)

### DECISION TREE REGRESSION ###
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)
results['Decision Tree Regression'] = _score_predictions(y_test, y_pred_tree)

# Convert results to DataFrame (one row per model, one column per metric)
results_df = pd.DataFrame(results).T

# Find best and worst models based on RMSE (lower is better)
best_model = results_df['RMSE'].idxmin()
worst_model = results_df['RMSE'].idxmax()

print("\n### Model Performance Summary ###")
print(results_df)

print(f"\n🔹 Best Model: {best_model} (Lowest RMSE)")
print(f"🔻 Worst Model: {worst_model} (Highest RMSE)")

# Explanation of best and worst models
best_explanations = {
    "Multiple Linear Regression": "\n✅ Multiple Linear Regression performed best because it considers multiple features, leading to a better fit.",
    "Polynomial Regression": "\n✅ Polynomial Regression performed best because it captures non-linear relationships better than linear models.",
    "Decision Tree Regression": "\n✅ Decision Tree Regression performed best because it can capture complex patterns and interactions.",
}
# Any model not listed above falls through to the simple-regression message
print(
    best_explanations.get(
        best_model,
        "\n✅ Simple Linear Regression performed best, which is rare but could indicate a strong linear relationship with engine size.",
    )
)

worst_explanations = {
    "Decision Tree Regression": "\n⚠️ Decision Tree Regression performed worst because it may have overfitted the training data, leading to poor generalization.",
    "Simple Linear Regression": "\n⚠️ Simple Linear Regression performed worst because it only considers one feature, missing other important factors.",
    "Polynomial Regression": "\n⚠️ Polynomial Regression performed worst, which may happen if the polynomial degree is too low or the model overfits.",
    # Previously this outcome produced no explanation at all
    "Multiple Linear Regression": "\n⚠️ Multiple Linear Regression performed worst, which may happen if the selected features are strongly collinear or the relationship with price is not linear.",
}
worst_explanation = worst_explanations.get(worst_model)
if worst_explanation is not None:
    print(worst_explanation)

# Visualization: Bar Chart for RMSE, MSE, and R²
# Each entry: (results_df column, panel title, y-axis label)
metric_panels = (
    ("RMSE", "RMSE Comparison", "RMSE (Lower is Better)"),
    ("MSE", "MSE Comparison", "MSE (Lower is Better)"),
    ("R²", "R² Score Comparison", "R² Score (Higher is Better)"),
)
bar_colors = ['blue', 'green', 'red', 'purple']  # One colour per model

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (metric, panel_title, y_label) in zip(axes, metric_panels):
    ax.bar(results_df.index, results_df[metric], color=bar_colors)
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.set_xticks(range(len(results_df.index)))  # Set tick locations
    ax.set_xticklabels(results_df.index, rotation=45)  # Set tick labels

plt.tight_layout()
plt.show()



# Output of the run above:
# FileNotFoundError: [Errno 2] No such file or directory: '/content/CarPrice_Assignment_.csv'