In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_text
from sklearn.tree import plot_tree
from sklearn.datasets import load_iris
from sklearn.preprocessing import label_binarize
import pickle
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Read the cars_2010_2020 CSV from the mounted Drive into a DataFrame
# and preview its first rows.
file_path = '/content/drive/MyDrive/Colab Notebooks/cars_2010_2020.csv'
df = pd.read_csv(file_path)
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# How many rows each make has.
df["Make"].value_counts()

In [None]:
# How many rows each model has.
df["Model"].value_counts()

In [None]:
# How many rows each year has.
df['Year'].value_counts()

In [None]:
# How many rows each distinct engine size has.
df["Engine Size (L)"].value_counts()

In [None]:
# How many rows each fuel type has.
df["Fuel Type"].value_counts()

In [None]:
# How many rows share each distinct price value.
df["Price (USD)"].value_counts()

In [None]:
# Column labels, for reference when selecting features below.
df.columns

In [None]:
# Sum of 'Price (USD)' per make, drawn as each make's share of the grand total.
price_by_make = df.groupby('Make', as_index=False)['Price (USD)'].sum()

fig = px.pie(
    price_by_make,
    names='Make',
    values='Price (USD)',
    height=400,
    title="Distribution of Make by Price",
)
fig.update_layout(showlegend=True)
fig.show()


In [None]:
# Sum of 'Price (USD)' per year, drawn as each year's share of the grand total.
price_by_year = df.groupby('Year', as_index=False)['Price (USD)'].sum()

fig = px.pie(
    price_by_year,
    names='Year',
    values='Price (USD)',
    height=400,
    title="Distribution of Year by Price",
)
fig.update_layout(showlegend=True)
fig.show()


In [None]:
# Histogram over engine size; with y given, each bar is the summed
# 'Price (USD)' of the rows falling in that bin (plotly's default histfunc).
fig = px.histogram(
    df,
    x="Engine Size (L)",
    y="Price (USD)",
    nbins=10,
    height=500,
    color_discrete_sequence=['red'],
    title="Distribution of Price by Engine Size (L)",
)
fig.update_xaxes(title_text="Engine Size (L)")
fig.update_yaxes(title_text="Price")
fig.show()

In [None]:
# Histogram over model; with y given, each bar is the summed 'Price (USD)'
# of the rows for that x value (plotly's default histfunc).
fig = px.histogram(
    df,
    x="Model",
    y="Price (USD)",
    nbins=10,
    height=500,
    color_discrete_sequence=['red'],
    title="Distribution of Price by Model",
)
fig.update_xaxes(title_text="Model")
fig.update_yaxes(title_text="Price")
fig.show()

In [None]:
# Sum of 'Price (USD)' per fuel type, drawn as each type's share of the grand total.
price_by_fuel = df.groupby('Fuel Type', as_index=False)['Price (USD)'].sum()

fig = px.pie(
    price_by_fuel,
    names='Fuel Type',
    values='Price (USD)',
    height=400,
    title="Distribution of Fuel Type by Price",
)
fig.update_layout(showlegend=True)
fig.show()


In [None]:
# Wide table: one row per year, one column per make, cells hold summed prices.
df_pivot = (
    df.pivot_table(values='Price (USD)', index='Year', columns='Make', aggfunc='sum')
      .reset_index()
)

# After reset_index the first column is 'Year'; every remaining column is a make.
fig = px.area(
    df_pivot,
    x='Year',
    y=df_pivot.columns[1:],
    height=600,
    title="Distribution of Price by Make Over the Years",
)
fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Price (USD)",
    legend_title="Make",
)
fig.show()

In [None]:
# Re-check dtypes before encoding.
df.info()

In [None]:
# Column labels again, for the encoding list in the next cell.
df.columns

In [None]:
# Integer-encode the listed columns in place. LabelEncoder numbers the sorted
# distinct values, so the numeric columns ('Engine Size (L)', 'Price (USD)')
# are replaced by the rank of each distinct value, not kept as magnitudes.
features_to_encode = ['Make', 'Model','Engine Size (L)', 'Fuel Type', 'Price (USD)']

# Keep one fitted encoder per column. A single shared encoder is re-fitted on
# every iteration and ends up knowing only the last column's classes, which
# makes it impossible to map codes back to the original values afterwards
# (e.g. label_encoders['Make'].inverse_transform(codes)).
label_encoders = {}
for feature in features_to_encode:
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])
    label_encoders[feature] = encoder

In [None]:
# Confirm the dtypes after label encoding.
df.info()

In [None]:
# Predictors are every column except the price; the target is the
# (label-encoded) 'Price (USD)' column.
feature_cols =['Make', 'Model', 'Year', 'Engine Size (L)', 'Fuel Type']
X = df[feature_cols]
y = df['Price (USD)']

In [None]:
# 70/30 split with a fixed seed.
# NOTE(review): these four variables are reassigned by a second
# train_test_split call (test_size=0.2) before any model is fitted, so this
# split is never actually used.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# One-hot encode every feature column. handle_unknown='ignore' makes a value
# that occurs only in the test split encode as an all-zero group instead of
# raising ValueError when the fitted pipeline is asked to predict.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), feature_cols)
    ])

In [None]:
# Degree-2 expansion (squares and pairwise products) without the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)

In [None]:
# Polynomial regression: one-hot encoding -> degree-2 feature expansion ->
# ordinary least squares.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', poly),
    ('model', LinearRegression()),
])

In [None]:
# 80/20 split with a fixed seed; this is the split every model below is
# trained and scored on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the encoder, the polynomial expansion and the linear model on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Polynomial-regression predictions for the held-out rows.
y_pred = pipeline.predict(X_test)

In [None]:
# Hand-computed coefficient of determination (R^2) on the test split,
# reported as a percentage.

# Total sum of squares: spread of the true values around their own mean.
mean_y = np.mean(y_test)
squared_deviations = (y_test - mean_y) ** 2
total_variance = np.sum(squared_deviations)

# Residual sum of squares (RSS): what the predictions fail to capture.
squared_residuals = (y_test - y_pred) ** 2
residual_sum_of_squares = np.sum(squared_residuals)

# R^2 = 1 - RSS / TSS, scaled to percent.
unexplained_share = residual_sum_of_squares / total_variance
explained_variance_percentage = (1 - unexplained_share) * 100

print(f'Explained Variance Percentage: {explained_variance_percentage:.2f}%')

In [None]:
# Logistic regression with default settings on the label-encoded features.
# The target is the label-encoded price, so every distinct price code is
# treated as its own class.
lg = LogisticRegression()
lg.fit(X_train, y_train)

In [None]:
# Predicted class (price code) for each held-out row.
y_predict= lg.predict(X_test)

In [None]:
# Share of held-out rows whose predicted class equals the true class,
# rescaled from a fraction to a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
accuracy_log_reg = 100 * accuracy
print(accuracy_log_reg)

In [None]:
# Keep column 1 of the class-probability matrix, i.e. the probability of the
# second entry in lg.classes_.
# NOTE(review): the target has one class per distinct price code, so this is
# not a "positive class" probability as in the binary case; y_prob is not read
# by any later cell in this notebook.
y_prob = lg.predict_proba(X_test)[:, 1]

In [None]:
# Support-vector classifier with default settings; the price codes are the classes.
model = SVC()
model.fit(X_train, y_train)

In [None]:
# SVC predictions for the held-out rows.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so in
# that run y_pred was not refreshed here and still held the previous model's
# predictions.
y_pred = model.predict(X_test)

KeyboardInterrupt: 

In [None]:
# Percentage of held-out rows where y_pred equals y_test, stored as the SVM score.
svm_hit_rate = accuracy_score(y_test, y_pred)
accuracySVM = svm_hit_rate * 100
print('SVM Accuracy: ', accuracySVM)

In [None]:
# 5-nearest-neighbours classifier on the label-encoded features.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

In [None]:
# KNN predictions for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Percentage of held-out rows where y_pred equals y_test, stored as the KNN score.
knn_hit_rate = accuracy_score(y_test, y_pred)
accuracyKNN = knn_hit_rate * 100
print('KNN Accuracy: ', accuracyKNN)

In [None]:
# Support-vector regressor with default settings, fitted on the price codes.
model = SVR()
model.fit(X_train, y_train)

In [None]:
# Score the SVR currently held in `model`. The predictions have to be
# refreshed first: without this line y_pred still contains the KNN
# classifier's output and the figure printed below would describe the wrong
# model.
y_pred = model.predict(X_test)

# "Accuracy" here means 100 minus the mean absolute percentage error.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
accuracySVR = 100 - mape

print(f'Accuracy: {accuracySVR:.2f}%')

In [None]:
# Plain linear regression on the label-encoded features (no one-hot or
# polynomial step, unlike the pipeline above).
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Linear-regression predictions for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# "Accuracy" here means 100 minus the mean absolute percentage error.
relative_errors = np.abs((y_test - y_pred) / y_test)
mape = np.mean(relative_errors) * 100
accuracyLN = 100 - mape
print(f'Accuracy: {accuracyLN:.2f}%')

In [None]:
# Support-vector regressor with default settings (same model as the earlier SVR cell).
model = SVR()
model.fit(X_train, y_train)

In [None]:
# SVR predictions for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# "Accuracy" here means 100 minus the mean absolute percentage error.
relative_errors = np.abs((y_test - y_pred) / y_test)
mape = np.mean(relative_errors) * 100
accuracySVR = 100 - mape

print(f'Accuracy: {accuracySVR:.2f}%')

In [None]:
# Gradient-boosted regression trees with default settings.
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)

In [None]:
# Gradient-boosting predictions for the held-out rows.
y_pred = gb_model.predict(X_test)

In [None]:
# "Accuracy" here means 100 minus the mean absolute percentage error.
relative_errors = np.abs((y_test - y_pred) / y_test)
mape = np.mean(relative_errors) * 100
accuracyGradientBoostingRegressor = 100 - mape

print(f'Accuracy: {accuracyGradientBoostingRegressor:.2f}%')

In [None]:
# LightGBM regressor with default settings.
lgb_reg = lgb.LGBMRegressor()
lgb_reg.fit(X_train, y_train)

In [None]:
# LightGBM predictions for the held-out rows.
y_pred = lgb_reg.predict(X_test)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` taken over the entries
        whose true value is non-zero, or ``nan`` if there is no such entry.

    Notes
    -----
    Inputs are converted to plain float arrays before subtracting, so two
    pandas Series are compared position by position rather than being
    re-aligned on their index labels (which silently yields NaN when the
    labels differ). Entries with ``y_true == 0`` are skipped: their percentage
    error is undefined and would otherwise turn the whole mean into
    ``inf``/``nan``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(actual - predicted)
    # Broadcast the denominator to the error's shape so the mask below lines
    # up even when one argument is a scalar.
    scale = np.broadcast_to(np.abs(actual), abs_error.shape)

    valid = scale != 0
    if not valid.any():
        return float('nan')
    return np.mean(abs_error[valid] / scale[valid]) * 100

# "Accuracy" here means 100 minus the mean absolute percentage error.
lgbm_mape = mean_absolute_percentage_error(y_test, y_pred)
accuracyLGBMRegressor = 100 - lgbm_mape
print(f"Accuracy: {accuracyLGBMRegressor:.2f}%")

In [None]:
def compare_algorithms_accuracy(df, feature_cols, test_size=0.2, random_state=42):
    """Report which of the previously computed model scores is the highest.

    The scores are not computed here: they are read from the notebook-level
    variables assigned by the evaluation cells (``accuracy_log_reg``,
    ``explained_variance_percentage``, ``accuracyKNN``, ``accuracySVM``,
    ``accuracySVR``, ``accuracyLN``, ``accuracyGradientBoostingRegressor``,
    ``accuracyLGBMRegressor``), so all of those cells must have run first.

    Parameters
    ----------
    df, feature_cols, test_size, random_state
        Not used by the function; kept so existing calls keep working.

    Returns
    -------
    str
        A sentence naming the best-scoring entry and its score, or a notice
        when none of the scores is a finite number.

    Notes
    -----
    The compared numbers are different metrics (classification accuracy,
    R^2 as a percentage, and 100 minus MAPE), so the ranking is only a rough
    guide. Non-finite scores are left out of the comparison: a NaN makes
    ``max`` depend on dictionary order and can be reported as the "winner".
    """
    allAcc = {'Logistic Regression Accuracy': accuracy_log_reg,
      'polynominal regression':explained_variance_percentage,
      'KNN accuracy':accuracyKNN,
      'SVM accuracy':accuracySVM,
      'accuracy SVR':accuracySVR,
      'LinearRegression aaccuracy': accuracyLN,
      'GradientBoostingRegressor accuracy':accuracyGradientBoostingRegressor,
      'LGBMRegressor accuracy':accuracyLGBMRegressor,}

    finite_scores = {name: score for name, score in allAcc.items() if np.isfinite(score)}
    if not finite_scores:
        return "No algorithm produced a finite accuracy score to compare."

    highest_accuracy = max(finite_scores, key=finite_scores.get)
    return f"The algorithm with the highest accuracy is: {highest_accuracy} with an accuracy of {finite_scores[highest_accuracy]:.2f}%"
# Build and print the summary sentence for the best-scoring model.
result = compare_algorithms_accuracy(df, feature_cols)
print(result)

In [None]:
# Train the random forest that this cell persists. No earlier cell defines
# rf_clf, so on a fresh kernel the dump below would stop with a NameError.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

# Serialise the fitted model to the working directory. Pickle files can run
# arbitrary code when loaded, so only unpickle this file from a trusted source.
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_clf, file)