# In [None]:
#Task 1.1: Data Loading and Initial Exploration

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import seaborn as sns


#1.2 Read the CSV file from the mounted Google Drive folder

DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/PDAI LIthan/car__1_.csv'
df = pd.read_csv(DATA_PATH)

# Print the DataFrame
print(df)

#1.3 Display the first 5 rows of the dataset to get an initial overview
# A bare `df.head(5)` is only rendered when it is the last expression of a
# notebook cell; here it is followed by more code, so print it explicitly.
print(df.head(5))

# Features are every column except the last one; the last column is used as
# the prediction target.
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

#Task 2: Data Splitting
from sklearn.model_selection import train_test_split

#2.1 Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

#2.2 Display basic information about the training set using the info method
x_train.info()

#Task 3: Exploratory Data Analysis (EDA)

#3.1 The 'Unnamed: 0' column appears to be a redundant index, so remove it from both splits.
for split in (x_train, x_test):
    split.drop(columns=['Unnamed: 0'], inplace=True)

#3.2 Use the first space-separated token of 'Name' as the car's 'Manufacturer'.
x_train['Manufacturer'] = x_train["Name"].str.split(" ", expand=True)[0]
x_test['Manufacturer'] = x_test["Name"].str.split(" ", expand=True)[0]

#3.3 Bar chart of how many training rows belong to each manufacturer.
ax = sns.countplot(data=x_train, x='Manufacturer')
plt.xticks(rotation=90)  # rotate the manufacturer labels by 90 degrees
ax.set_xlabel("Manufacturer of Car")
ax.set_ylabel("Count of cars for each Manufacturer")
ax.set_title("Count of cars based on each Manufacturer")

#3.4 'Name' has served its purpose once the manufacturer is extracted.
for split in (x_train, x_test):
    split.drop(columns=["Name"], inplace=True)

#3.5 'Location' is not expected to significantly influence car prices.
for split in (x_train, x_test):
    split.drop(columns=["Location"], inplace=True)

#4.2 Scale the 'Kilometers_Driven' column using MinMaxScaler.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# The train/test split has already been taken from `df`, so scaling `df` here
# would never reach the data the models are trained on. Scale the split
# frames instead: fit the scaler on the training rows only (so no information
# from the test rows leaks into it) and reuse the fitted scaler on the test rows.
x_train['Kilometers_Driven'] = scaler.fit_transform(x_train[['Kilometers_Driven']]).ravel()
x_test['Kilometers_Driven'] = scaler.transform(x_test[['Kilometers_Driven']]).ravel()

#4.3 Extract the numerical values from the 'Mileage' column and handle missing values.

# Keep the text before the first space and convert it to a number; anything
# that cannot be parsed becomes NaN (errors='coerce').
x_train['Mileage'] = pd.to_numeric(
    x_train['Mileage'].str.split(" ", expand=True)[0], errors='coerce'
)
x_test['Mileage'] = pd.to_numeric(
    x_test['Mileage'].str.split(" ", expand=True)[0], errors='coerce'
)

# Number of missing mileage values in each split.
print(x_train['Mileage'].isnull().sum())
print(x_test['Mileage'].isnull().sum())

# Fill the missing values with the TRAINING mean in both splits, so the test
# set is imputed with a statistic learned from training data only.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form does not update the DataFrame
# under pandas Copy-on-Write.
mileage_mean = x_train['Mileage'].mean()
x_train['Mileage'] = x_train['Mileage'].fillna(mileage_mean)
x_test['Mileage'] = x_test['Mileage'].fillna(mileage_mean)

#4.4 Process the 'Engine', 'Power', and 'Seats' columns by removing units, converting to numeric, and handling missing values.

# 'Engine' and 'Power' are text: keep the part before the first space and
# convert it to a number; unparseable entries become NaN (errors='coerce').
for col in ('Engine', 'Power'):
    x_train[col] = pd.to_numeric(
        x_train[col].str.split(" ", expand=True)[0], errors='coerce'
    )
    x_test[col] = pd.to_numeric(
        x_test[col].str.split(" ", expand=True)[0], errors='coerce'
    )

# Impute missing values with the TRAINING mean in both splits, so the test set
# is filled with a statistic learned from training data only.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form does not update the DataFrame
# under pandas Copy-on-Write.
for col in ('Engine', 'Power', 'Seats'):
    train_mean = x_train[col].astype("float64").mean()
    x_train[col] = x_train[col].fillna(train_mean)
    x_test[col] = x_test[col].fillna(train_mean)

#4.5 'New_Price' has a high number of missing values, so remove it from both splits.

for split in (x_train, x_test):
    split.drop(columns=["New_Price"], inplace=True)

#Task 4: Feature Engineering
# Convert categorical columns ('Fuel_Type', 'Transmission', 'Owner_Type', 'Manufacturer') into dummy variables.

categorical_cols = ["Fuel_Type", "Transmission", "Owner_Type", "Manufacturer"]

# The training split defines the feature layout; drop_first removes one
# baseline level per categorical column.
x_train = pd.get_dummies(x_train, columns=categorical_cols, drop_first=True)

# Encode every level present in the test split (no drop_first here): dropping
# the first level independently could remove a different level than the one
# dropped in training, and that level would then be wrongly filled with 0.
x_test = pd.get_dummies(x_test, columns=categorical_cols)

# Align the test columns to the training layout: levels missing from the test
# split become all-zero columns, and columns the training split does not have
# (its baseline levels, or levels unseen in training) are discarded.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

#Task 5: Data Processing
#5.1 Normalize Numerical Features.

#5.2 Scale the datasets using StandardScaler.

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit the scaler on the training features only and reuse it for the test
# features. The results are wrapped back into DataFrames so the column names
# and row index are kept for the models below.
std_scaler = StandardScaler()
x_train = pd.DataFrame(
    std_scaler.fit_transform(x_train), columns=x_train.columns, index=x_train.index
)
x_test = pd.DataFrame(
    std_scaler.transform(x_test), columns=x_test.columns, index=x_test.index
)

#Task 6: Model Training and Evaluation
#6.1 Train a Linear Regression model on the preprocessed training data.

lir = LinearRegression()
lir.fit(x_train, y_train)
y_pred = lir.predict(x_test)

#6.2 Evaluate the performance of the Linear Regression model using the R-squared score on the test set.
# Compute the score once and reuse it when printing.
lir_r2 = r2_score(y_test, y_pred)
print ("The r2 score of Linear Regression model is", lir_r2)

#6.3 Train a Random Forest Regressor model with 100 estimators on the preprocessed training data.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Request the 100 trees explicitly instead of relying on the library default,
# and fix the seed (same value as the train/test split) so the reported score
# is reproducible between runs.
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(x_train, y_train)
y_pred = rfr.predict(x_test)

#6.4 Evaluate the performance of the Random Forest Regressor model using the R-squared score on the test set.
print ("The r2 score of Random Forest Regressor is", r2_score(y_test, y_pred))
