import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
from converter import columnsMapper, convertX


# In [None]:
# Load the car sales dataset from the working directory and print a quick
# overview of its structure.
df = pd.read_csv('car_sales_data.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
# This message has no placeholders, so it is a plain string literal rather
# than an f-string.
print("\nFirst few rows:")
# Bare expression: its value is only shown by environments that echo
# expression results (e.g. an interactive/notebook cell).
df.head()


# Choose the input columns (features) and the column to predict (target).
X_columns = ['Gender', 'Color']
y_column = 'Model'

# Take independent copies so later steps do not operate on views of `df`.
X_df = df.loc[:, X_columns].copy()
y_series = df.loc[:, y_column].copy()

print(f"Features (X): {X_columns}")
print(f"Target (y): {y_column}")

# Report the distinct values of each column involved, one column at a time.
gender_values = df['Gender'].unique()
print(f"\nUnique values in Gender: {gender_values}")
color_values = df['Color'].unique()
print(f"Unique values in Color: {color_values}")
model_values = df['Model'].unique()
print(f"Unique values in Model: {model_values}")


# In [None]:
# Encode features and target with the project's `converter` helpers.
# NOTE(review): presumably `columnsMapper` builds a per-column value -> code
# mapping and `convertX` applies it; confirm against the converter module.
mapper_X = columnsMapper(X_df, X_columns)
X_encoded = convertX(X_df, mapper_X)

# The helpers are called with DataFrames, so the target Series is wrapped in
# a single-column frame. A fresh frame is built for each call so neither
# helper receives an object the other has already been given.
y_frame_for_mapper = pd.DataFrame({y_column: y_series})
mapper_y = columnsMapper(y_frame_for_mapper, [y_column])

y_frame_for_convert = pd.DataFrame({y_column: y_series})
y_encoded_frame = convertX(y_frame_for_convert, mapper_y)
# Pull the target column back out of the converted frame.
y_encoded = y_encoded_frame[y_column]

print("Encoded features (X):")
X_preview = X_encoded.head()
print(X_preview)
y_preview = y_encoded.head().tolist()
print(f"\nEncoded target (y) sample: {y_preview}")


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

train_size = len(X_train)
print(f"Training set size: {train_size}")
test_size_rows = len(X_test)
print(f"Test set size: {test_size_rows}")


# In [None]:
# Random forest with 100 trees; the fixed seed makes training repeatable.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)

# Fit on the training portion only.
model.fit(X_train, y_train)

print("Model trained successfully!")


# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Shown both as a fraction (4 decimals) and as a percentage (2 decimals).
accuracy_percent = accuracy * 100
print(f"Model accuracy: {accuracy:.4f} ({accuracy_percent:.2f}%)")


# In [None]:
# Persist the fitted classifier to disk.
model_filename = 'car_model_predictor.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to: {model_filename}")

# Persist the feature and target mappers together in one file, keyed by
# 'X' and 'y'.
mapper_filename = 'car_model_mappers.joblib'
mappers_bundle = {'X': mapper_X, 'y': mapper_y}
joblib.dump(mappers_bundle, mapper_filename)
print(f"Mappers saved to: {mapper_filename}")


# Keep only the rows whose Company is exactly 'Hyundai', as an independent copy.
is_hyundai = df['Company'].eq('Hyundai')
hyundai_df = df.loc[is_hyundai].copy()

hyundai_row_count = len(hyundai_df)
print(f"Number of rows: {hyundai_row_count}")
# Bare expression: only shown by environments that echo expression results.
hyundai_df.head()


# In [None]:
# Toyota rows with a Price strictly above 40000, as an independent copy.
is_toyota = df['Company'].eq('Toyota')
is_expensive = df['Price'].gt(40000)
toyota_expensive_df = df.loc[is_toyota & is_expensive].copy()

toyota_row_count = len(toyota_expensive_df)
print(f"Number of rows: {toyota_row_count}")
# Bare expression: only shown by environments that echo expression results.
toyota_expensive_df.head()


# Count how often each Model value occurs, then keep the first three entries
# of the resulting counts.
model_counts = df['Model'].value_counts()
top_3_models = model_counts.iloc[:3]

print("3 most popular car models:")
# Walk the labels and their counts in lockstep.
for model_name, count in zip(top_3_models.index, top_3_models):
    print(f"  {model_name}: {count} occurrences")

# Bare expression: only shown by environments that echo expression results.
top_3_models
