<a href="https://colab.research.google.com/github/Brevex/US-Used-Cars-Data-Analysis/blob/main/US_Used_Cars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install dask
!pip install opendatasets

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import opendatasets as od
import matplotlib.pyplot as plt

from dask import dataframe as dd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

In [None]:
# Fetch the Kaggle dataset into the current working directory.
dataset_url = "https://www.kaggle.com/datasets/ananaymital/us-used-cars-dataset"
od.download(dataset_url)

In [None]:
# Only the columns used by the analysis are read from the CSV.
selected_columns = [
    'engine_cylinders',
    'fuel_tank_volume',
    'horsepower',
    'make_name',
    'mileage',
    'model_name',
    'power',
    'price',
    'torque',
    'transmission',
    'year',
]

# Read the file lazily with dask in 64MB blocks ...
dask_df = dd.read_csv(
    '/content/us-used-cars-dataset/used_cars_data.csv',
    blocksize="64MB",
    usecols=selected_columns,
)

# ... then materialise the result as a single pandas DataFrame.
df = dask_df.compute()
df

In [None]:
# Summary of the loaded frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Report, for every column that has missing values, the percentage of rows
# that are NaN. Results are kept in:
#   nan_col      -> names of the columns containing NaN
#   col_percent  -> (column, percentage) pairs
#   count        -> number of columns containing NaN
nan_col = []
col_percent = []

total_rows = df.shape[0]

for col in df.columns:
    nan_qntt = df[col].isna().sum()

    if nan_qntt > 0:
        percent = nan_qntt / total_rows * 100
        # Format the number itself (the original wrapped it in a list, which
        # printed it surrounded by square brackets).
        print('A coluna {} tem {:.2f}% NaN'.format(col, percent))
        nan_col.append(col)
        col_percent.append((col, percent))

# The counter is simply the number of columns collected above.
count = len(nan_col)

print(count)
print(nan_col)

In [None]:
# Absolute number of missing values per column.
df.isna().sum()

In [None]:
# Keep only the rows in which no column is missing (defaults spelled out).
ignored_nan_df = df.dropna(axis=0, how='any')
ignored_nan_df

In [None]:
# Drop the 'mileage' column first, then drop every row that still contains a
# missing value in any remaining column.
df_no_mileage = df.drop(columns='mileage')
ignoring_nan_and_mileage_df = df_no_mileage.dropna()

# Only the last expression of a cell is displayed, so the bare frame that used
# to precede this call showed nothing and was removed.
ignoring_nan_and_mileage_df.info()

In [None]:
# Build the cleaned frame in a single chain so no column is assigned on an
# intermediate result (which is what can trigger SettingWithCopyWarning).
df_tratando = (
    ignoring_nan_and_mileage_df
    .drop_duplicates()
    .assign(
        # Removing strings from data: '--' marks a missing value and the
        # remaining entries carry a literal ' gal' suffix before the number.
        fuel_tank_volume=lambda frame: (
            frame['fuel_tank_volume']
            .replace('--', np.nan)
            .str.replace(' gal', '', regex=False)
            .astype(float)
        )
    )
    # Free-text columns that are not used by the rest of the analysis.
    .drop(columns=['power', 'torque'])
)

# Rows whose fuel tank volume was '--' are NaN now and get dropped here.
# The explicit copy makes df_final an independent frame for later cells.
df_final = df_tratando.dropna().copy()

In [None]:
# First ten rows of the cleaned frame.
df_final.head(10)

# Importance score for each variable

In [None]:
# Defining the input variables (X) and the target variable (y)
X = df_final.drop("price", axis=1)
y = df_final["price"]

# Encoding categorical variables as integer codes using LabelEncoder
categorical_cols = ["engine_cylinders", "make_name", "model_name", "transmission"]

for col in categorical_cols:
    # A fresh encoder per column: each column has its own set of categories.
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating the Random Forest model. The seed is fixed so that the fitted
# forest (and the feature importances derived from it) is the same on every
# run of the notebook.
model = RandomForestRegressor(random_state=42)

# Training the model with the training data
model.fit(X_train, y_train)

# Making predictions with the test set
y_pred = model.predict(X_test)

# Evaluating the model and showing the result (previously the metrics were
# computed but never displayed).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:,.2f}")
print(f"R2:  {r2:.4f}")

In [None]:
# Horizontal bar chart of the forest's feature importances, ordered from the
# least important feature to the most important one.
importances = model.feature_importances_
sorted_idx = importances.argsort()

feature_names = X.columns[sorted_idx]
feature_scores = importances[sorted_idx]

plt.barh(feature_names, feature_scores)
plt.xlabel("Random Forest Feature Importance")
plt.show()

# Horsepower x Engine Chart

In [None]:
# Average 'horsepower' for every distinct 'engine_cylinders' value
mean_horsepower = df.groupby('engine_cylinders')['horsepower'].agg('mean')

# A wider figure; the pandas plot below draws on this figure's current axes
plt.figure(figsize=(12, 8))

# One bar per engine type
ax = mean_horsepower.plot.bar()

# Title and axis labels
ax.set_title("Average Horsepower by Engine Cylinders")
ax.set_xlabel("Engine Cylinders")
ax.set_ylabel("Average Horsepower")

plt.show()

# Average Horsepower score per engine

In [None]:
# Work on a small private copy so that df_final is NOT modified by this cell.
# Adding helper columns to the shared frame would leak them into any model
# that is later trained on df_final and would make the cell non-idempotent.
engine_hp = df_final[['engine_cylinders', 'horsepower']].copy()
engine_hp['horsepower'] = pd.to_numeric(engine_hp['horsepower'])

# Calculating the average horsepower for each engine type. Grouping directly
# by the label makes an encode / inverse-transform round trip unnecessary.
grouped_data = (
    engine_hp
    .groupby('engine_cylinders')['horsepower']
    .mean()
    .reset_index()
)

# Normalizing the average of 'horsepower' between 0 and 1 (min-max scaling)
min_horsepower = grouped_data['horsepower'].min()
max_horsepower = grouped_data['horsepower'].max()
horsepower_range = max_horsepower - min_horsepower

if horsepower_range == 0:
    # All averages are identical (e.g. a single engine type): min-max scaling
    # is undefined, so every engine gets the same score instead of NaN.
    grouped_data['score'] = 0.0
else:
    grouped_data['score'] = (grouped_data['horsepower'] - min_horsepower) / horsepower_range

# Engine name and score only, sorted by score in descending order
df_scores = (
    grouped_data[['engine_cylinders', 'score']]
    .sort_values('score', ascending=False)
    .reset_index(drop=True)
)

# Creating the bar plot
plt.figure(figsize=(10, 6))
plt.barh(df_scores['engine_cylinders'], df_scores['score'], color='steelblue')
plt.xlabel('Score')
plt.ylabel('Engine Cylinders')
plt.title('Score by Engine Cylinders')
plt.show()

# Price prediction

In [None]:
# Features are every column except the target. 'engine_cylinders_encoded' is
# a plotting helper that duplicates 'engine_cylinders'; it is excluded if it
# happens to be present in df_final (errors='ignore' covers the case where
# it is not).
X = df_final.drop(columns=['price', 'engine_cylinders_encoded'], errors='ignore')
y = df_final['price']

label_encoder = LabelEncoder()

# Replace every text column by integer codes; the encoder is re-fitted for
# each column, so the codes are independent between columns.
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = label_encoder.fit_transform(X[col])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluation on the held-out set
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Show the metrics (previously they were computed but never displayed)
print(f"RMSE: {rmse:,.2f}")
print(f"MAE:  {mae:,.2f}")
print(f"R2:   {r2:.4f}")

In [None]:
# Real vs predicted prices on the test set: a scatter layer plus seaborn's
# regression layer, both drawn on the same axes.
fig, ax = plt.subplots()

sns.scatterplot(x=y_test, y=y_pred, ax=ax)
sns.regplot(x=y_test, y=y_pred, ax=ax)

ax.set_xlabel('Real Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Real vs Predicted Price')
plt.show()
