In [6]:
# Assignment4
# Task for dataset 5:


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


# ---------------------------------------------------------------------------
# Assignment 4: preprocessing of the Ford car-price data.
# Reads 'ford_car_price.csv', cleans / encodes / scales it, and writes the
# resulting table to 'new_car_data.csv'.
# ---------------------------------------------------------------------------

# Load and de-duplicate. The index is reset immediately so that every object
# derived below (X, y, select_x_df and the train/test splits) shares the same
# 0..n-1 labels. Without this, dropping duplicate rows leaves gaps in the
# labels of `y`, while the feature frame rebuilt from a NumPy array gets a
# fresh RangeIndex, so features and target are only aligned by position.
df = (
    pd.read_csv('ford_car_price.csv')
    .drop_duplicates()
    .reset_index(drop=True)
)

# Fill missing numeric values with the column mean. The imputer is re-fitted
# for each column; `.ravel()` flattens its (n, 1) output into a plain column.
# NOTE(review): 'price' is the prediction target; imputing a target with its
# mean fabricates labels. Dropping rows with a missing price may be preferable,
# but that would change the exported file, so it is left as-is here.
imputer = SimpleImputer(strategy='mean')
numeric_columns = ['price', 'mileage', 'tax', 'mpg', 'engineSize']
for column in numeric_columns:
    df[column] = imputer.fit_transform(df[[column]]).ravel()

# Fill missing categorical values with the most frequent value, then one-hot
# encode. drop_first=True removes one dummy per category.
cat_columns = ['model', 'transmission', 'fuelType']
for column in cat_columns:
    df[column] = df[column].fillna(df[column].mode()[0])
df = pd.get_dummies(df, columns=cat_columns, drop_first=True)

# Standardise the numeric predictors (the target 'price' is left unscaled).
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set statistics leak into the scaled features.
# Fitting it on the training rows only would avoid that, but changes the
# contents of the exported CSV.
scaler = StandardScaler()
scaled_features = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
df[scaled_features] = scaler.fit_transform(df[scaled_features])

# Pairwise correlations of every column, printed for inspection.
cor_mat = df.corr()
print("Correlation Matrix:\n", cor_mat)

# NOTE(review): the reason for dropping these two columns is not recorded in
# the notebook; confirm it is intentional.
df = df.drop(columns=['tax', 'engineSize'])

# Separate predictors from the target.
X = df.drop('price', axis=1)
y = df['price']

# k='all' keeps every feature, so this step does not remove any column.
selector = SelectKBest(score_func=f_regression, k='all')
select_x = selector.fit_transform(X, y)

# Rebuild a labelled frame from the selector's ndarray output, re-using X's
# index so that the features stay label-aligned with `y`.
select_x_df = pd.DataFrame(
    select_x,
    columns=X.columns[selector.get_support()],
    index=X.index,
)

X_train, X_test, y_train, y_test = train_test_split(select_x_df, y, test_size=0.2, random_state=42)

print("NEW Data:")
print(X_train.head())

# Features and target share one index, so a plain column-wise concat lines up.
final_df = pd.concat([select_x_df, y], axis=1)

final_df.to_csv('new_car_data.csv', index=False)


# The End

Correlation Matrix:
                                   year     price   mileage       tax  \
year                          1.000000  0.635715 -0.708690  0.297086   
price                         0.635715  1.000000 -0.530483  0.405814   
mileage                      -0.708690 -0.530483  1.000000 -0.257402   
tax                           0.297086  0.405814 -0.257402  1.000000   
mpg                          -0.021702 -0.346263  0.117697 -0.502198   
engineSize                   -0.137582  0.411451  0.214692  0.185439   
model_ C-MAX                 -0.078755 -0.088128  0.046377 -0.118497   
model_ EcoSport               0.084409  0.010959 -0.091369  0.096413   
model_ Edge                   0.024422  0.237867 -0.007716  0.075683   
model_ Escort                -0.076181 -0.014665  0.010271  0.018326   
model_ Fiesta                -0.021439 -0.333063 -0.065318 -0.152754   
model_ Focus                  0.037677  0.112554  0.011668 -0.020449   
model_ Fusion                -0.128589 -0.0

# **ASSIGNMENT-05**

In [7]:
# Assignment-05

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# ---------------------------------------------------------------------------
# Assignment 5: fit and score two baseline models on 'new_car_data.csv'.
# ---------------------------------------------------------------------------

df = pd.read_csv('new_car_data.csv')
y = df['price']
X = df.drop(columns='price')

# ---- Linear regression: predict the price itself ---------------------------
print("**Linear Regression")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

linear_model = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Error on the held-out 20 %; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred_linear)
rmse = np.sqrt(mse)

print(f"MSE: {mse}", f"RMSE: {rmse}", sep="\n")

# ---- Logistic regression: classify price as above / not above the median ---
print("**Logistic Regression")

# Binary label: 1 when the price is strictly above the median, otherwise 0.
price_median = df['price'].median()
above_median = df['price'] > price_median
df['price_class'] = np.where(above_median, 1, 0)

y_logistic = df['price_class']
X_logistic = df.drop(columns=['price', 'price_class'])

X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X_logistic, y_logistic, test_size=0.2, random_state=42
)

logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_log, y_train_log)
y_pred_log = logistic_model.predict(X_test_log)

# Classification metrics on the held-out split.
accu = accuracy_score(y_test_log, y_pred_log)
pre = precision_score(y_test_log, y_pred_log)
recall = recall_score(y_test_log, y_pred_log)
f1 = f1_score(y_test_log, y_pred_log)
conf_mat = confusion_matrix(y_test_log, y_pred_log)

print(
    f"Accuracy:{accu}",
    f"Precision: {pre}",
    f"Recall: {recall}",
    f"F1-Score:{f1}",
    "Confusion Matrix:",
    conf_mat,
    sep="\n",
)

**Linear Regression
MSE: 4161391.548511408
RMSE: 2039.94890830908
**Logistic Regression
Accuracy:0.9090653943306203
Precision: 0.9194395796847635
Recall: 0.8943781942078365
F1-Score:0.9067357512953368
Confusion Matrix:
[[1664  138]
 [ 186 1575]]
