## Logistic Regression

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR, SVC
from sklearn.preprocessing import StandardScaler

In [16]:
df = pd.read_csv('New_Data_FullyCLeaned.csv')

X = df.drop(columns='price')
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
ohe = OneHotEncoder()
ohe.fit(X[['model','location','engine_type','transmission','year','mileage']])

column_trans = make_column_transformer((OneHotEncoder(categories = ohe.categories_),['model','location','engine_type','transmission','year','mileage']),remainder='passthrough')

lr = LinearRegression()

pipe = make_pipeline(column_trans,lr)

pipe.fit(X_train,y_train)

# Fit the pipeline on the training data
pipe.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipe.predict(X_test)

# Evaluate the model performance
# r2 = r2_score(y_test, y_pred)
# print(f'R-squared score on the test set: {r2:.2f}')

R-squared score on the test set: 0.82


## Naive bayes Algorithm

In [39]:
# df = pd.read_csv('New_Data_FullyCLeaned.csv')

# X = df.drop(columns='price')
# y = df['price']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ohe = OneHotEncoder()
ohe.fit(X[['model','location','engine_type','transmission']])

column_trans = make_column_transformer((OneHotEncoder(categories = ohe.categories_),['model','location','engine_type','transmission']),remainder='passthrough')

pipeline = Pipeline([
    ('tfidf_nb', column_trans),
    ('nb', MultinomialNB())
])

pipe.fit(X_train,y_train)

# Fit the pipeline on the training data
pipe.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipe.predict(X_test)

# Evaluate the model performance
r2 = r2_score(y_test, y_pred)
print(f'R-squared score on the test set: {r2:.2f}')

R-squared score on the test set: 0.82


## Random Forest Algorithm

In [42]:
# # Load your CSV data
# df = pd.read_csv('New_Data_FullyCLeaned.csv')

# X = df.drop(columns='price')
# y = df['price']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use OneHotEncoder for categorical features
ohe = OneHotEncoder()
ohe.fit(X[['model', 'location', 'engine_type', 'transmission']])

# Create a column transformer
column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_), ['model', 'location', 'engine_type', 'transmission']),
    remainder='passthrough'
)

# Create a pipeline with Random Forest
pipeline = make_pipeline(column_trans, RandomForestRegressor())

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model performance
r2 = r2_score(y_test, y_pred)
print(f'R-squared score on the test set: {r2:.2f}')

R-squared score on the test set: 0.86


## SVM Algorithm

In [33]:
# Assuming you have a DataFrame 'df' with columns 'model', 'location', 'engine_type', 'transmission', 'year', 'mileage', and 'price'
column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_), ['model', 'location', 'engine_type', 'transmission']),
    remainder='passthrough'
)

# Create a pipeline with the column transformer, standard scaler, and SVM regressor
pipeline = make_pipeline(
    column_trans,
    StandardScaler(with_mean=False),  # Standardize features by removing the mean and scaling to unit variance
    SVC()  # Support Vector Regression
)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model performance
r2 = r2_score(y_test, y_pred)
print(f'R-squared score on the test set: {r2:.2f}')

R-squared score on the test set: -0.32
