In [58]:
# IMPORTS

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [59]:
# Train and evaluate a random-forest regressor that predicts "mpg" from the
# remaining columns of the auto-mpg dataset.

# The split and the forest below are created without an explicit random_state,
# so this global NumPy seed is what makes the cell reproducible.
np.random.seed(42)
Mileage_set = pd.read_csv("DataSet/auto-mpg.csv")

x = Mileage_set.drop("mpg", axis=1)  # features: every column except the target
y = Mileage_set["mpg"]  # target

# One-hot encode the text column; every other column is passed through as-is.
cat_features = ["car name"]
transformed_mat = ColumnTransformer([("OHE",
                                      OneHotEncoder(sparse_output=False),  # dense array; the encoder returns a sparse matrix by default
                                      cat_features)],
                                    remainder="passthrough")

transformed_mat_2 = transformed_mat.fit_transform(x)
transformed_mat_df = pd.DataFrame(transformed_mat_2)

# "horsepower" contains '?' placeholders and cannot be fed to the model, so it
# is dropped. Its position is looked up from the fitted transformer rather than
# hard-coded: the index depends on how many one-hot columns "car name" produces
# (it is 307 for the current CSV).
feature_names = list(transformed_mat.get_feature_names_out())
horsepower_position = feature_names.index("remainder__horsepower")
transformed_mat_3 = transformed_mat_df.drop(horsepower_position, axis=1)
transformed_mat_df2 = transformed_mat_3  # name kept for the split below

x_train, x_test, y_train, y_test = train_test_split(transformed_mat_df2, y, test_size=0.2)

Model_Car = RandomForestRegressor()

Model_Car.fit(x_train, y_train)
# Reference scores from a previous run:
# Model_Car.score(x_train, y_train), Model_Car.score(x_test, y_test) -> (0.979657387192503, 0.9254980805290847)

# Predicting
Mileage_preds = Model_Car.predict(x_test)
Mileage_preds_df = pd.DataFrame(Mileage_preds)

# Calculating error (the 1-D prediction array is compared against y_test directly)
mae = mean_absolute_error(y_test, Mileage_preds)
mse = mean_squared_error(y_test, Mileage_preds)
r2s = r2_score(y_test, Mileage_preds)

# Reference values from a previous run, for comparing the error estimates:
# (mae, mse, r2s) -> (1.4953625000000006, 4.005703262499999, 0.9254980805290847)

In [60]:
# Side-by-side table of the held-out targets and the model's predictions,
# indexed by the original row labels carried by y_test.
Final_Prediction = y_test.to_frame(name="Actual-Mileage").assign(
    **{"Predicted-Mileage": Mileage_preds}
)

Final_Prediction

Unnamed: 0,Actual-Mileage,Predicted-Mileage
198,33.0,30.651
396,28.0,29.079
33,19.0,20.501
208,13.0,15.075
93,14.0,14.000
...,...,...
249,19.9,20.643
225,17.5,18.656
367,28.0,29.645
175,29.0,28.459


## The code in the cell below checks whether any column of a given dataset contains a character of our choice ->

In [61]:
import pandas as pd

# Report every row of transformed_mat_df in which some object-typed column
# contains a given character. Assumes transformed_mat_df is already defined.
# The DataFrame itself is only read here, never modified.

# The character to search for
char_to_search = r'\?'  # Escape the '?' character for regex

# Iterate over each column in the DataFrame
for column in transformed_mat_df.columns:
    if transformed_mat_df[column].dtype == 'object':  # Check if the column may contain string data
        # Search a temporary string view of the column. Writing the converted
        # column back would turn the values of the shared feature matrix into
        # strings for every later cell.
        column_as_text = transformed_mat_df[column].astype(str)

        # Check for the presence of the character
        contains_char = column_as_text.str.contains(char_to_search, na=False, regex=True)
        if contains_char.any():
            print(f"Rows in column '{column}' that contain '{char_to_search}':")
            print(transformed_mat_df[contains_char])
            print("\n")

# Optional variant: find the positions of '?' within the entries of each column
# for column in transformed_mat_df.columns:
#     if transformed_mat_df[column].dtype == 'object':  # Check if the column may contain string data
#         # Search a temporary string view; do not write it back to the DataFrame
#         column_as_text = transformed_mat_df[column].astype(str)
#
#         # Find positions of the character
#         positions = column_as_text.str.find(char_to_search[1])  # Use the actual '?' character
#         print(f"Positions of '{char_to_search[1]}' in column '{column}':")
#         print(positions)
#         print("\n")


Rows in column '307' that contain '\?':
     0    1    2    3    4    5    6    7    8    9    ...  302  303  304 305  \
32   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   4   
126  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   6   
330  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   4   
336  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   4   
354  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   4   
374  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   4   

       306 307   308   309 310 311  
32    98.0   ?  2046  19.0  71   1  
126  200.0   ?  2875  17.0  74   1  
330   85.0   ?  1835  17.3  80   2  
336  140.0   ?  2905  14.3  80   1  
354  100.0   ?  2320  15.8  81   2  
374  151.0   ?  3035  20.5  82   1  

[6 rows x 312 columns]




In [4]:

import pandas as pd
# Load the raw auto-mpg table from disk and show it as the cell output.
Mileage_set = pd.read_csv(filepath_or_buffer="DataSet/auto-mpg.csv")
Mileage_set

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger
