In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score , mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, RandomizedSearchCV
from category_encoders import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline, make_pipeline
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR
import seaborn as sns
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_selection import mutual_info_regression

In [None]:
# Path of the CSV file that holds the car data.
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
DATA_PATH = r"C:\Users\Admin\Desktop\All files\csv files\pak wheel\Pakwheel Data.csv"

# Load the file and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()


In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Discard the 'Unnamed: 0' column by rebinding `df` to the reduced frame.
df = df.drop(columns=['Unnamed: 0'])

In [None]:
# Number of distinct values in the Title column.
df['Title'].nunique()

In [None]:
# First five titles, to see the text format before parsing it.
df['Title'].head()

<h1>Data Cleaning</h1>

In [None]:
# Everything in the title before the first four-digit group becomes the car name.
# The pattern is written as a raw string: in a plain string literal "\d" is an
# invalid escape sequence, which current Python versions warn about.
df['CarName'] = df['Title'].str.split(r'\d{4}').str[0].str.strip()

In [None]:
# Preview the frame with the new CarName column.
df.head()

In [None]:
# Number of distinct car names extracted from the titles.
df['CarName'].nunique()

In [None]:
# Title has been parsed into CarName above; keep the frame without it.
df = df.drop(columns=['Title'])

In [None]:
# Regex for a trailing "cc" together with any whitespace in front of it.
pattern = r'\s*cc$'

# Replace the CC column with the values stripped of that suffix.
df = df.assign(CC=df['CC'].str.replace(pattern, '', regex=True))

In [None]:
# Check the CC column after removing the "cc" suffix.
df.head()

In [None]:
# Remove the "km" text from the mileage strings.
df = df.assign(Km_Driven=df['Km_Driven'].str.replace('km', '', regex=True))

In [None]:
# Remove the commas from the mileage strings.
df = df.assign(Km_Driven=df['Km_Driven'].str.replace(',', '', regex=True))

In [None]:
# Check the Km_Driven column after the two replacements.
df.head()

In [None]:
# First five extracted car names.
df['CarName'].head()

In [None]:
# Column dtypes and non-null counts before the numeric casts.
df.info()

In [None]:
# Frequency of each Engine_type value.
df['Engine_type'].value_counts()

<h1>Data visualization</h1>

In [None]:
# Bar chart of how many rows fall under each Engine_type value.
fig, ax = plt.subplots()
sns.countplot(data=df, x='Engine_type', ax=ax)
ax.set_xlabel("Fuel Type")
ax.set_ylabel("No of Cars")
ax.set_title("Count of cars by fuel type")
plt.show();

In [None]:
# Keep every row whose Engine_type is not 'Electric'.
df = df[df['Engine_type'] != 'Electric']

In [None]:
# Frequency of each Transmission value (plotted as a pie chart in a later cell).
a = df['Transmission'].value_counts()

In [None]:
# Count how many CC strings contain the letter 'k' (1) versus not (0).
df['CC'].map(lambda cc: int('k' in cc)).value_counts()

In [None]:
# Index labels of the rows whose CC value is not an empty string.
inds = df.index[df['CC'] != '']

# Keep only those rows and renumber them from 0.
df = df.loc[inds].reset_index(drop=True)

In [None]:
# Sanity check: number of Km_Driven strings that still contain a comma.
# The previous expression compared each whole value to ',' for equality and
# echoed the entire column, which does not show whether any separators remain.
# NOTE(review): intent inferred from the comma-stripping cell above; confirm.
df['Km_Driven'].str.contains(',', regex=False).sum()

In [None]:
# Index labels of the rows whose CC string has no 'k' in it.
inds = df.index[df['CC'].map(lambda cc: 'k' not in cc).astype(bool)]

# Keep only those rows and renumber them from 0.
df = df.loc[inds].reset_index(drop=True)

In [None]:
# Share of each transmission type. The counts are taken from the current frame:
# the counts stored in `a` were computed before the rows with unusable CC values
# were dropped, so they no longer describe the data used from here on.
transmission_counts = df['Transmission'].value_counts()
plt.pie(transmission_counts.values, labels=transmission_counts.index, autopct='%1.1f%%')
plt.title("Share of cars by transmission type");

In [None]:
# Cast the three numeric columns to integers in a single call.
df = df.astype({'Model': 'int', 'CC': 'int', 'Km_Driven': 'int'})

In [None]:
# Price against kilometres driven.
sns.scatterplot(x = 'Km_Driven', y = 'prices', data = df);

In [None]:
# Price against the Model column.
sns.scatterplot(x = 'Model', y = 'prices', data = df);

In [None]:
# Price against the CC column.
sns.scatterplot(x = 'CC', y = 'prices', data = df);

In [None]:
# Same Km_Driven/prices relationship, drawn with the pandas plotting API.
df.plot(kind = 'scatter', x = 'Km_Driven', y = 'prices')

In [None]:
# Hexbin view of Km_Driven against prices.
df.plot(kind = 'hexbin', x = 'Km_Driven', y = 'prices')

In [None]:
# Dtypes and non-null counts after the integer casts.
df.info()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Missing values per column, drawn as a bar chart.
missing_values_count = df.isnull().sum()

fig, ax = plt.subplots(figsize=(10, 6))
missing_values_count.plot.bar(ax=ax, color='skyblue')
ax.set_title('Count of Missing Values for Each Column')
ax.set_xlabel('Columns')
ax.set_ylabel('Count of Missing Values')
# Tilt the column names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show();

In [None]:
# Price distribution per Engine_type value.
sns.boxplot(data=df, x = 'Engine_type', y = 'prices');

In [None]:
# Price distribution per Transmission value.
sns.boxplot(data=df, x = 'Transmission', y = 'prices');

In [None]:
# Number of distinct CC values.
df['CC'].nunique()

In [None]:
# Preview the cleaned frame before encoding.
df.head()

<h1>Label Encoding</h1>

In [None]:
# Stand-alone encoder instance; a later cell refers to the name `label_encoder`.
label_encoder = LabelEncoder()

# Names of the categorical columns to encode. A plain list is used instead of a
# DataFrame slice: only the labels are needed, and iterating a DataFrame to get
# them hides that intent and copies the data for nothing.
cat_cols = ['Transmission', 'Engine_type', 'CarName']

# One fitted encoder per column, kept so the same label -> integer mapping can be
# applied to new inputs later.
label_encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Persist the fitted encoders next to the notebook.
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [None]:
# Preview the frame with the categorical columns replaced by integer codes.
df.head()

In [None]:
# Missing values per column after encoding.
df.isnull().sum()

In [None]:
# `prices` is the regression target. Filling its gaps with the column mean would
# invent labels for those rows, and the mean would be computed from rows that
# later end up in the test split. Rows without a price are dropped instead.
df = df.dropna(subset=['prices']).reset_index(drop=True)

In [None]:
# Preview the frame that is split into features and target next.
df.head()

In [None]:
# Features are every column except `prices`; the target is `prices` itself.
X, y = df.drop(columns=['prices']), df['prices']

<h1>Mutual info Regression</h1>

In [None]:
# Mutual information between every feature and the price.
# - random_state pins the estimator's internal randomness, so the scores are the
#   same on every run.
# - discrete_features marks the label-encoded columns as discrete; by default a
#   dense input is treated as entirely continuous.
discrete_mask = X.columns.isin(['Transmission', 'Engine_type', 'CarName'])
mi_scores = mutual_info_regression(X, y, discrete_features=discrete_mask, random_state=42)

mi_scores_df = pd.DataFrame({'Feature': X.columns, 'MI Score': mi_scores})

# Ascending order, so the strongest feature ends up at the top of the horizontal bar chart.
mi_scores_df_sorted = mi_scores_df.sort_values(by='MI Score', ascending=True)
mi_scores_df_sorted

In [None]:

# Horizontal bar chart of the mutual-information scores, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 8))
bars = ax.barh(mi_scores_df_sorted['Feature'], mi_scores_df_sorted['MI Score'], color='skyblue')

ax.set_title('Mutual Information Scores of Features', fontsize=16)
ax.set_xlabel('MI Score', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)

# Write each score at the end of its bar, vertically centred on the bar.
for bar in bars:
    score = bar.get_width()
    ax.text(score, bar.get_y() + bar.get_height() / 2, f'{score:.2f}',
            va='center', ha='left', fontsize=12, color='black')

ax.grid(axis='x', linestyle='--', alpha=0.7)
ax.tick_params(axis='both', labelsize=12)
fig.tight_layout()

plt.show();

In [None]:
# Standardise the three numeric features and keep the fitted scaler for later use.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the train/test
# split further down, so the test rows contribute to the scaling statistics.
scaled_cols = ['Model', 'CC', 'Km_Driven']

scaler = StandardScaler()
scaler.fit(X[scaled_cols])
X[scaled_cols] = scaler.transform(X[scaled_cols])

# Persist the fitted scaler next to the notebook.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [None]:
# Preview the feature frame after scaling.
X.head()

When I used a test size of 20%, the model scored 79% (R²); with a test size of 30% (0.3), it scored 85%.

In [None]:
# Hold out 30% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.3)

In [None]:
# Frequency of each encoded Engine_type value in the training split.
X_train['Engine_type'].value_counts()

<h1>RandomForestRegressor</h1>

In [None]:
# Baseline random forest with default hyper-parameters and a fixed seed.
forest = RandomForestRegressor(random_state=42)
forest.fit(X_train, y_train)

# score() on a regressor returns R^2. Convert to percent first and round last:
# rounding before multiplying by 100 can leave floating-point noise in the
# displayed value (e.g. 28.999999999999996).
round(forest.score(X_test, y_test) * 100, 2)

In [None]:
# Predicted prices for the test split.
forest_pred = forest.predict(X_test)
forest_pred

In [None]:
# R^2 on the training and test splits. A large gap between the two indicates
# over-fitting. The values are labelled as R^2 because score() on a regressor
# returns the coefficient of determination, not a classification accuracy.
acc_train = forest.score(X_train, y_train)
acc_test = forest.score(X_test, y_test)

print("Training R^2:", round(acc_train, 4))
print("Test R^2:", round(acc_test, 4))

In [None]:
# Feature importances reported by the fitted forest, one value per feature.
importances = forest.feature_importances_
importances

In [None]:
# Feature positions ordered from most to least important.
indices = np.flip(np.argsort(importances))
indices

In [None]:
# Column names in the same most-to-least-important order.
names = X_train.columns[indices].tolist()
names

In [None]:
# Bar chart of the forest's feature importances, highest first.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importances")

positions = list(range(X_train.shape[1]))
ax.bar(positions, importances[indices])
ax.set_xticks(positions)
ax.set_xticklabels(names, rotation=90)
plt.show()

<h1>GradientBoostingRegressor</h1>

In [None]:
# Base estimator for the grid search below. The seed is fixed, as for the other
# models in this notebook, so repeated runs give the same fitted model.
clr2 = GradientBoostingRegressor(random_state=42)

In [None]:
# Hyper-parameter grid: number of boosting stages and maximum tree depth.
params = dict(
    n_estimators=range(25, 200, 25),
    max_depth=range(10, 100, 10),
)

In [None]:
# Exhaustive 5-fold search over `params`, using every available core.
model_gredb = GridSearchCV(estimator=clr2, param_grid=params, cv=5, n_jobs=-1, verbose=1)

In [None]:
# Run the grid search on the training split.
model_gredb.fit(X_train, y_train)

In [None]:
# Test-split predictions from the fitted grid search.
gred_pred = model_gredb.predict(X_test)
gred_pred

In [None]:
# Mean absolute error of the gradient-boosting predictions on the test split.
Mae_gred = mean_absolute_error(y_test, gred_pred)
print("Mean Absolute Error:" , Mae_gred)

In [None]:
# R^2 of the gradient-boosting predictions on the test split.
r23 = r2_score(y_test, gred_pred)
print('R-squared Score:', round(r23, 2))

In [None]:
# Score of the tuned gradient-boosting model on both splits. With the default
# scoring, a regressor's score is R^2, so the output is labelled as such rather
# than as an accuracy.
acc_train = model_gredb.score(X_train, y_train)
acc_test = model_gredb.score(X_test, y_test)

print("Training R^2:", round(acc_train, 4))
print("Test R^2:", round(acc_test, 4))

<h1>
    Cross Validation
</h1>

In [None]:
# Single-step pipeline: a seeded random forest with 50 trees, registered under the step name 'model'.
my_pipeline = Pipeline(steps = [('model', RandomForestRegressor(random_state=42, n_estimators=50))])

In [None]:
# 5-fold cross-validation on the training split; prints the mean of the fold scores.
cv_acc_scores = cross_val_score(my_pipeline, X_train, y_train, cv = 5, n_jobs = -1)
print(cv_acc_scores.mean())

<h1>
    XGBRegressor
</h1>

In [None]:
# XGBoost regressor with n_estimators set to 1000 and a fixed seed.
model_xgb = XGBRegressor(n_estimators = 1000, random_state = 42)

In [None]:
# Early stopping needs data the booster is not being fitted on. When the training
# rows themselves are monitored, the loss practically always keeps improving, so
# training is not cut short. A validation part is carved out of the training
# split; the test split stays untouched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# early_stopping_rounds is configured on the estimator: passing it to fit() is
# deprecated in recent XGBoost releases.
model_xgb.set_params(early_stopping_rounds=5)
model_xgb.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
# Test-split predictions from the XGBoost model.
xgb_pred = model_xgb.predict(X_test)
xgb_pred

In [None]:
# Mean absolute error of the XGBoost predictions. It is stored under its own name
# so that the gradient-boosting result in `Mae_gred` is not overwritten.
Mae_xgb = mean_absolute_error(y_test, xgb_pred)
print("Mean Absolute Error:" , Mae_xgb)

In [None]:
# R^2 of the XGBoost predictions. It is stored under its own name so that the
# gradient-boosting result in `r23` is not overwritten.
r2_xgb = r2_score(y_test, xgb_pred)
print('R-squared Score:', round(r2_xgb, 2))

In [None]:
# Ask for the description of one car. The numeric answers are converted to int
# immediately; the categorical answers stay as typed.
KM__Driven = int(input('Enter KM Driven: '))
CC__ = int(input('Enter CC: '))
Model__ = int(input('Enter Model: '))
Engine_type__ = input('Enter engine type: ')
car_name__ = input('Enter car name: ')
Transmission__ = input('Enter transmission: ')

# Raw answers keyed by the feature names used in this notebook.
new_data1 = dict(
    Model=Model__,
    CC=CC__,
    Engine_type=Engine_type__,
    Transmission=Transmission__,
    Km_Driven=KM__Driven,
    CarName=car_name__,
)

# One-row frame (index 0) built from those answers.
new_data = pd.DataFrame([new_data1])
new_data

In [None]:
categorical_cols = ['Engine_type', 'Transmission', 'CarName']
numerical_cols = ['Model', 'CC', 'Km_Driven']

# Start again from the raw answers, so that re-running this cell never encodes or
# scales values that an earlier run already transformed.
new_data = pd.DataFrame(new_data1, index=[0])

for col in categorical_cols:
    le = label_encoders[col]
    # Work on a copy of the label list: the fitted encoder is shared with other
    # cells (and pickled), so it must not be changed by a single prediction.
    known_labels = list(le.classes_)
    value = new_data.at[0, col]
    if value not in known_labels:
        # Best effort, as before: an unseen label gets the next free code. The
        # model never saw that code, so the user is told about it.
        print(f"Warning: {col}={value!r} was not seen during training; the prediction may be unreliable.")
        known_labels.append(value)
    codes = {label: code for code, label in enumerate(known_labels)}
    new_data[col] = new_data[col].map(codes)

# Apply the scaling that was fitted earlier in the notebook.
new_data[numerical_cols] = scaler.transform(new_data[numerical_cols])

# Column order handed to the model.
new_data = new_data[['Model', 'CC', 'Engine_type', 'Transmission', 'Km_Driven', 'CarName']]

prediction = forest.predict(new_data)

print(prediction)

<h1>Saving the model</h1>

In [None]:
# Save the fitted forest. The context manager closes the file handle; an inline
# open() call left it open until garbage collection.
with open('model_car_price_predict.pkl', 'wb') as f:
    pickle.dump(forest, f)

In [None]:
# Reload the model saved above and predict on the test split, to confirm that the
# pickle round-trip works.
with open("model_car_price_predict.pkl", 'rb') as f:
    model = pickle.load(f)

y_test_pred = model.predict(X_test)

# Label the predictions with what they are (the previous name, 'bankrupt', has
# nothing to do with this data) and keep the test-split index.
y_test_pred = pd.Series(y_test_pred, index = X_test.index, name = 'predicted_price')
y_test_pred

In [None]:
# Scaled copies of the numeric inputs, computed from the raw answers with the
# scaler fitted earlier. fit_transform must not be used here: it would refit the
# shared scaler on a single row (every output becomes 0) and discard the
# statistics that the other cells rely on.
raw_numeric = pd.DataFrame(new_data1, index=[0])[['Model', 'CC', 'Km_Driven']]
scaled_numeric = scaler.transform(raw_numeric)

new_data['Model_Sc'] = scaled_numeric[:, 0]
new_data['CC_Sc'] = scaled_numeric[:, 1]
new_data['Km_Driven_Sc'] = scaled_numeric[:, 2]

In [None]:
# Encoded copies of the categorical inputs, produced from the raw answers with the
# encoders fitted on the training data. Refitting a blank encoder on one row would
# map every value to 0, and the column is named 'Engine_type', not 'Engine_Type'.
# An answer the encoder does not know raises a ValueError here.
raw_categories = pd.DataFrame(new_data1, index=[0])
for col in ['Transmission', 'Engine_type', 'CarName']:
    new_data[f'{col}_encoded'] = label_encoders[col].transform(raw_categories[col])

In [None]:
# Predict with the model reloaded from disk; no `model_tree` is defined anywhere
# in the cells above. Only the columns the model was trained on are passed, in
# training order, because `new_data` may carry extra helper columns by now.
prediction_on_new_data = model.predict(new_data[list(X_train.columns)])
prediction_on_new_data

In [None]:
# Search space for the randomized search over random-forest hyper-parameters.
param_distributions = {
    # 100, 200, ..., 1000 trees
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
    # 1.0 = consider all features at each split. It replaces 'auto', which meant
    # the same thing for regressors and is rejected by current scikit-learn.
    'max_features': [1.0, 'sqrt'],
    # 10, 20, ..., 110, plus None for unlimited depth
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [None]:
# Build the search object BEFORE fitting it. The fit call used to sit in a cell
# above the one creating `random_search`, so a fresh "Run All" stopped with a
# NameError.
model_forest = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(estimator=model_forest, param_distributions=param_distributions,
                                   n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# 100 sampled configurations x 3 folds.
random_search.fit(X_train, y_train)

# Test-split score in percent; multiply first and round last to avoid
# floating-point noise in the displayed value.
round(random_search.score(X_test, y_test) * 100, 2)

In [None]:
#round(grid_search.score(X_test, y_test), 2) * 100

In [None]:
#grid_search = GridSearchCV(estimator=forest, param_grid = param_grid,  n_jobs = -1, cv = 5)
#grid_search.fit(X_train, y_train) 

In [None]:
#param_grid = {
 #   'n_estimators': [100, 200, 300], 
  #  'max_depth': [5, 10, 15],          
   # 'min_samples_split': [2, 5, 10],   
    #'min_samples_leaf': [1, 2, 4]       
#}

#forest = RandomForestRegressor(random_state=42)