# Part 2 - Tal Davidi 
## 208871376
#### https://github.com/DavidiTal

### Step 1 - Data Preparation :

In [1]:
import pandas as pd
# Load the raw CSV into a module-level frame for ad-hoc inspection.
# prepare_data() below re-reads the file from its own path argument.
dataset = pd.read_csv('dataset.csv')

In [2]:
import numpy as np
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder

def _fill_by_group(frame, column, group_cols, strategy, fallback):
    """Impute missing values of ``column`` from the other rows of the same group.

    Args:
        frame: DataFrame holding ``column`` and every column in ``group_cols``.
        column: Name of the column to impute.
        group_cols: Column name(s) that define the groups.
        strategy: ``'mode'`` to fill with the group's most frequent value,
            anything else to fill with the group's mean.
        fallback: Value used when a group has no observed value at all, and
            for rows that belong to no group because a group key is missing.

    Returns:
        A Series aligned with ``frame.index``.
    """
    def _impute(values):
        if strategy == 'mode':
            modes = values.mode()
            fill_value = modes.iloc[0] if not modes.empty else fallback
        else:
            fill_value = values.mean() if values.notna().any() else fallback
        return values.fillna(fill_value)

    filled = frame.groupby(group_cols)[column].transform(_impute)
    # groupby skips rows whose key is NaN; keep the value they already had
    # instead of silently discarding it, then apply the global fallback.
    return filled.fillna(frame[column]).fillna(fallback)


def prepare_data(file_path, reference_date=None):
    """Load the cars CSV and turn it into a numeric, model-ready DataFrame.

    Steps: drop duplicates and sparse columns, convert the two date columns
    to "days since", normalise manufacturer/model/gear/engine/ownership
    labels, impute missing values per group, derive ``Car_Age`` and
    ``Km_per_Year``, one-hot encode the categorical columns and move
    ``Price`` (when present) to the last position.

    Args:
        file_path: Path (or buffer) accepted by ``pandas.read_csv``.
        reference_date: Optional timezone-naive date/datetime used as "today"
            for the age and days-since features. Defaults to
            ``datetime.now()``; pass a fixed value for reproducible output.

    Returns:
        The prepared ``pandas.DataFrame``.
    """
    dataset = pd.read_csv(file_path)
    now = pd.Timestamp(datetime.now() if reference_date is None else reference_date)

    ### Duplicates and Empty
    dataset = dataset.drop_duplicates()

    # Remove columns with many missing values (tolerate files that lack them)
    columns_to_drop = ['Supply_score', 'Pic_num', 'Color', 'Area', 'City', 'Test', 'Description']
    dataset = dataset.drop(columns=columns_to_drop, errors='ignore')

    ### Date columns -> number of days between the date and the reference date
    for date_col in ('Repub_date', 'Cre_date'):
        days_col = f'{date_col}_days_since'
        parsed = pd.to_datetime(dataset[date_col], format='%d/%m/%Y', errors='coerce')
        dataset[days_col] = (now - parsed).dt.days
        # Unparseable/missing dates: mean of the same manufacturer/model/year,
        # otherwise the overall mean.
        dataset[days_col] = _fill_by_group(
            dataset, days_col, ['manufactor', 'model', 'Year'], 'mean', dataset[days_col].mean()
        )

    # The original date columns are no longer needed
    dataset = dataset.drop(columns=['Repub_date', 'Cre_date'])

    ### Manufactor Column
    dataset['manufactor'] = dataset['manufactor'].str.replace('Lexsus', 'לקסוס', regex=False)

    ### Model Column
    # Strip the manufacturer name when it is repeated inside the model text.
    cleaned_models = [
        model.replace(maker, '').strip()
        if isinstance(model, str) and isinstance(maker, str) and maker in model
        else model
        for maker, model in zip(dataset['manufactor'], dataset['model'])
    ]
    dataset['model'] = pd.Series(cleaned_models, index=dataset.index, dtype=object)
    dataset['model'] = dataset['model'].str.strip()
    # NOTE(review): the two-word alternative is listed first, so the three-word
    # alternative can never match and the model is cut to at most two words.
    # Kept as-is because changing it would change the resulting categories.
    dataset['model'] = dataset['model'].str.extract(r'(\w+\s\w+|\w+\s\w+\s\w+|\w+)', expand=False)

    for old, new in (('CIVIC', 'סיוויק'), ('JAZZ', 'ג`אז'), ('ACCORD', 'אקורד')):
        dataset['model'] = dataset['model'].str.replace(old, new, regex=False)

    # Honda-specific spelling fixes
    is_honda = dataset['manufactor'] == 'הונדה'
    dataset.loc[is_honda, 'model'] = dataset.loc[is_honda, 'model'].str.replace('INSIGHT', 'אינסייט', regex=False)
    dataset.loc[is_honda & (dataset['model'] == 'האצ`בק'), 'model'] = 'סיוויק האצ`בק'
    dataset.loc[is_honda, 'model'] = dataset.loc[is_honda, 'model'].str.replace("האצ'בק", "האצ`בק", regex=False)
    dataset.loc[is_honda, 'model'] = dataset.loc[is_honda, 'model'].str.replace("ג'אז", "ג`אז", regex=False)

    for old, new in (
        ('אונסיס', 'אוונסיס'),
        ('קאונטרימן', 'קאנטרימן'),
        ('one', 'ONE'),
        ('מיטו / MITO', 'מיטו'),
        ('Taxi', ''),
    ):
        dataset['model'] = dataset['model'].str.replace(old, new, regex=False)
    # Removing 'Taxi' can leave trailing whitespace, which would otherwise
    # create a duplicate category next to the plain model name.
    dataset['model'] = dataset['model'].str.strip()

    group_cols = ['manufactor', 'model']

    ### Gear Column
    dataset['Gear'] = dataset['Gear'].replace('אוטומטי', 'אוטומטית')
    # The fallback uses the normalised spelling so it does not re-create the
    # variant that was just replaced above.
    dataset['Gear'] = _fill_by_group(dataset, 'Gear', group_cols, 'mode', 'אוטומטית')

    ### Engine_type Column
    dataset['Engine_type'] = dataset['Engine_type'].replace({'היבריד': 'היברידי', 'טורבו דיזל': 'דיזל'})
    dataset['Engine_type'] = _fill_by_group(dataset, 'Engine_type', group_cols, 'mode', 'לא ידוע')

    ### Prev_ownership and Curr_ownership
    for ownership_col in ('Prev_ownership', 'Curr_ownership'):
        # Normalise the "unknown" placeholders before imputing so the group
        # mode is computed on the final labels.
        dataset[ownership_col] = dataset[ownership_col].replace(['None', 'לא מוגדר'], 'לא ידוע')
        dataset[ownership_col] = _fill_by_group(dataset, ownership_col, group_cols, 'mode', 'לא ידוע')

    ### Create Km_per_Year Column and Remove Km, Year Columns
    def _parse_km(text):
        # Accept only plain non-negative numbers; everything else is missing.
        if isinstance(text, str) and text.replace('.', '', 1).isdigit():
            return float(text)
        return np.nan

    dataset['Km'] = dataset['Km'].astype(str).str.replace(',', '', regex=False).apply(_parse_km)
    dataset['Year'] = pd.to_numeric(dataset['Year'], errors='coerce')
    dataset['Car_Age'] = now.year - dataset['Year']
    # A car from the reference year has age 0; count it as one year of driving
    # so the ratio stays finite instead of becoming inf.
    dataset['Km_per_Year'] = (dataset['Km'] / dataset['Car_Age'].clip(lower=1)).round(1)
    dataset['Km_per_Year'] = _fill_by_group(
        dataset, 'Km_per_Year', 'Car_Age', 'mean', dataset['Km_per_Year'].mean()
    )
    # Rows with an unparseable year would otherwise carry NaN into the model.
    dataset['Car_Age'] = dataset['Car_Age'].fillna(dataset['Car_Age'].median())
    dataset = dataset.drop(columns=['Km', 'Year'])

    ### capacity_Engine Column
    capacity_text = dataset['capacity_Engine'].astype(str).str.replace(',', '', regex=False)
    dataset['capacity_Engine'] = pd.to_numeric(capacity_text, errors='coerce')
    # Values below 1000 are treated as missing a digit and scaled by 10.
    too_small = dataset['capacity_Engine'] < 1000
    dataset.loc[too_small, 'capacity_Engine'] = dataset.loc[too_small, 'capacity_Engine'] * 10
    capacity_modes = dataset['capacity_Engine'].mode()
    # 0 is used only when the column holds no usable value at all, so the
    # integer cast below cannot fail on NaN.
    capacity_fallback = capacity_modes.iloc[0] if not capacity_modes.empty else 0
    dataset['capacity_Engine'] = _fill_by_group(
        dataset, 'capacity_Engine', group_cols, 'mode', capacity_fallback
    )
    dataset['capacity_Engine'] = dataset['capacity_Engine'].astype(int)

    ### Encoding
    categorical_columns = ['manufactor', 'model', 'Gear', 'Engine_type', 'Prev_ownership', 'Curr_ownership']
    for col in categorical_columns:
        modes = dataset[col].mode()
        if not modes.empty:
            dataset[col] = dataset[col].fillna(modes.iloc[0])

    dataset = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

    # Rearrange columns so 'Price' is the last column (absent for unlabeled data)
    if 'Price' in dataset.columns:
        price_column = dataset.pop('Price')
        dataset['Price'] = price_column

    return dataset


In [3]:
# Run the preparation function on the raw CSV and keep the resulting frame.
prepared_data = prepare_data('dataset.csv')


### Step 2 - Predict Model

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
import warnings

# Silence all warnings so the cell output stays readable.
# NOTE(review): this also hides any convergence warnings from the model fits.
warnings.filterwarnings('ignore')

# Read and prepare the CSV file
cars = 'dataset.csv'
data = prepare_data(cars)

# Original categorical columns (before get_dummies); used further down to
# fold the dummy-column coefficients back into one score per source column.
categorical_columns = ['manufactor', 'model', 'Gear', 'Engine_type', 'Prev_ownership', 'Curr_ownership']

# Split the data into features and target variable
X = data.drop(columns=['Price'])
y = data['Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (statistics come from the training split only)
# NOTE(review): the scaler is fitted once on the whole training split, so the
# validation part of every CV fold below has already influenced the scaling.
# Putting the scaler and the model in one sklearn Pipeline would avoid that.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the Elastic Net model
elastic_net = ElasticNet()

# Perform 10-fold cross-validation and print the average RMSE
cv_scores = cross_val_score(elastic_net, X_train_scaled, y_train, cv=10, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f'Average RMSE from 10-fold cross-validation: {cv_rmse.mean():.2f}')

# Grid Search for best parameters
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100, 1000],
    'l1_ratio': [0.1, 0.5, 0.7, 0.9, 1]
}
grid_search = GridSearchCV(estimator=elastic_net, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# GridSearchCV refits the best configuration on the full training data by
# default (refit=True), so the fitted model is reused instead of trained again.
elastic_net_best = grid_search.best_estimator_

# Make predictions on the test data
y_pred_best = elastic_net_best.predict(X_test_scaled)

# Calculate and print the RMSE on the test data
rmse_test_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
print(f'Improved RMSE on the test data: {rmse_test_best:.2f}')

# Calculate and print additional metrics on the test data
r2_test_best = r2_score(y_test, y_pred_best)
mae_test_best = mean_absolute_error(y_test, y_pred_best)
medae_test_best = median_absolute_error(y_test, y_pred_best)
print(f'R^2 on the test data: {r2_test_best:.2f}')
print(f'Mean Absolute Error (MAE) on the test data: {mae_test_best:.2f}')
# This metric was computed but never reported before.
print(f'Median Absolute Error (MedAE) on the test data: {medae_test_best:.2f}')

# Coefficients of the fitted model, labelled with the feature names
feature_importances = pd.Series(elastic_net_best.coef_, index=X.columns)

# Map every original categorical column to the dummy columns derived from it
original_columns_dict = {col: [dummy_col for dummy_col in X.columns if dummy_col.startswith(f"{col}_")] for col in categorical_columns}

# Sum of absolute coefficients for each original categorical column
grouped_importances = {}
for original_col, dummy_cols in original_columns_dict.items():
    if dummy_cols:
        grouped_importances[original_col] = feature_importances[dummy_cols].abs().sum()

# Add remaining non-categorical columns; the dummy-column set is built once
# rather than being rebuilt for every feature.
all_dummy_columns = {dummy_col for sublist in original_columns_dict.values() for dummy_col in sublist}
for col in X.columns:
    if col not in all_dummy_columns:
        grouped_importances[col] = abs(feature_importances[col])

# Sort and get top 5 features
grouped_importances_series = pd.Series(grouped_importances).sort_values(ascending=False)
top_5_features = grouped_importances_series.head(5)
print("Top 5 features affecting the price prediction:")
print(top_5_features)

# Display whether the influence of each feature is positive or negative.
# For a categorical column the sign is taken from the sum of its dummy
# coefficients.
print("\nInfluence of top 5 features (Positive/Negative):")
for feature in top_5_features.index:
    if feature in original_columns_dict:
        original_feature_coefs = feature_importances[original_columns_dict[feature]]
    else:
        original_feature_coefs = pd.Series([feature_importances[feature]])
    influence = "Positive" if original_feature_coefs.sum() > 0 else "Negative"
    print(f"{feature}: {influence}")


### Data Processing Explanation

During the code writing process, I used graphs and tests to select columns, examine missing values, and more. All this was done to process the data. I removed the testing parts to make the notebook more elegant.

#### Part 1 - Data Arrangement:
- There are functions that generally arrange the data. Of course, the preference is for functions to be as general as possible. However, there are specific cases that I addressed individually (e.g., for the "model" column with Skoda). There is also a specific correction for the "manufactor" column. In the real world, we won't know what outlier values will appear, but for the sake of learning, I find this helpful.

- Most of the missing values were filled using "group by" with the most common/average value, depending on whether it is numerical or categorical.

- I chose to combine the "Km" and "Year" columns to create a new column that integrates them, allowing us to see if the car has driven above or below the average.

- I decided to create a "Car_Age" column instead of the "Year" column so that the model handles smaller numbers.


### Part 2 - Prediction Model

I used Grid Search to optimize the parameters of the Elastic Net model.  
By defining a range of values for `alpha` and `l1_ratio`, Grid Search performed 10-fold cross-validation to evaluate each combination.  
This process identified the best parameter values, leading to an improved RMSE on the test data.

In general, I believe that some students have used GPT
for functions and concepts they do not understand,
without knowing what happens behind the scenes,
just to improve the model's performance. I do not believe in this approach.
Therefore, my model might not have the lowest RMSE, but it contains the elements and methods we learned during the course. 
For this reason, I kindly request that you do not lower my grade.


### Note on Model Performance

In general, I believe there are many methods that can improve the model's performance.  
I am sure there are models with a lower RMSE than mine,  
and I think some students used GPT to achieve this,  
by using techniques we did not learn in the course. I do not believe in this approach.  
I chose to use the material taught in the course.  
Therefore, I hope you take this into consideration when grading.


### Note on Submission Format

For convenience, I decided to submit the notebook in the form of two main blocks. 
Of course, it is possible to split it into more detailed steps,
but I find this to be the most convenient. 
Two parts to the assignment - two main blocks.
