In [1]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import RFE, mutual_info_regression
from sklearn.preprocessing import MinMaxScaler
import joblib

# Step 2: Load the Dataset

In [None]:

# Read the CSV into a DataFrame.
# NOTE(review): the path below is absolute and specific to one Windows machine;
# it has to be edited before the notebook can run anywhere else.
file_path = 'C:\\Users\\santh\\Downloads\\Processed_Flipdata - Processed_Flipdata.csv'
dataset = pd.read_csv(file_path)

# Peek at the top rows (rendered as a rich table)
print("First few rows of the dataset:")
head_rows = dataset.head()
display(head_rows)

# Column names, dtypes and non-null counts
print("\nDataset Information:")
dataset.info()

# Number of missing entries per column
print("\nMissing Values in Each Column:")
missing_counts = dataset.isnull().sum()
print(missing_counts)

# Descriptive statistics of the numeric columns
print("\nSummary Statistics:")
summary_stats = dataset.describe()
display(summary_stats)

# Step 3: Data Visualization

In [None]:
# Visualize pairwise relationships and per-column distributions.
# sns.pairplot builds its own grid of subplots, so the title is attached to the
# figure as a whole; plt.title would only label whichever subplot is current.
pair_grid = sns.pairplot(dataset)
pair_grid.figure.suptitle('Pairplot of Features', y=1.02)  # y > 1 lifts the title above the top row
plt.show()


# Step 4: Data Preprocessing



In [None]:
# Convert camera columns to numeric by stripping the 'MP' suffix.
# pd.to_numeric (instead of .astype(int)) lets missing entries pass through as
# NaN so the mean-imputation step below can fill them; a plain int cast would
# raise on the first missing value. Malformed values still raise.
for camera_col in ['Rear Camera', 'Front Camera']:
    dataset[camera_col] = pd.to_numeric(
        dataset[camera_col].str.replace('MP', '', regex=False).str.strip()
    )

# Convert 'Prize' to numeric (removing commas); regex=False treats ',' literally
dataset['Prize'] = dataset['Prize'].str.replace(',', '', regex=False).astype(float)

# Check for non-numeric columns and their data types
print("\nData Types After Conversion:")
print(dataset.dtypes)

# Handle missing values (mean imputation over the numeric columns)
dataset = dataset.fillna(dataset.mean(numeric_only=True))

# One-hot encoding for categorical variables (first level dropped per column)
dataset = pd.get_dummies(dataset, columns=['Colour', 'Processor_'], drop_first=True)

# Normalize numerical features to the [0, 1] range.
# NOTE(review): the scaler is fit on every row here, before the train/test split
# performed in a later cell, so test rows influence the scaling ranges.
scaler = MinMaxScaler()
numerical_cols = ['Memory', 'RAM', 'Battery_', 'Mobile Height', 'Rear Camera', 'Front Camera']
dataset[numerical_cols] = scaler.fit_transform(dataset[numerical_cols])

# Final dataset after preprocessing
print("\nFinal Preprocessed Dataset:")
display(dataset.head())


# Step 5: Feature Analysis and Extraction


In [None]:
# Feature matrix: everything except the target 'Prize' and the 'Model' column
# (either one is skipped silently if it is not present).
X = dataset.drop(columns=['Prize', 'Model'], errors='ignore')
# Target vector
y = dataset['Prize']

# Recursive feature elimination around a plain linear regression, keeping 5 features
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=5)
fit = rfe.fit(X, y)  # fit() returns the fitted selector itself

# Names of the columns the selector kept
selected_features_rfe = X.columns[fit.support_]
print("\nSelected Features by RFE:")
print(selected_features_rfe)

# Mutual information between each feature and the target, highest first.
# The estimator is randomized, so it is seeded (same seed as the train/test
# split) to keep the scores and the bar order stable across re-runs.
mi_scores = pd.Series(
    mutual_info_regression(X, y, random_state=42),
    index=X.columns,
).sort_values(ascending=False)

# Plot Mutual Information Scores
fig, ax = plt.subplots(figsize=(12, 6))
mi_scores.plot(kind='bar', ax=ax)
ax.set_title('Mutual Information Scores')
ax.set_ylabel('Score')
plt.show()

# Lasso with 5-fold cross-validated regularisation strength, used as a feature filter
lasso = LassoCV(cv=5).fit(X, y)  # fit() returns the fitted estimator

# Pair each coefficient with its column name and keep only the non-zero ones
all_coefficients = pd.Series(lasso.coef_, index=X.columns)
lasso_coef = all_coefficients.loc[all_coefficients != 0]

print("\nFeatures Selected by Lasso Regression:")
print(lasso_coef)


# Step 6: Model Building



In [None]:
# Split the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Metrics per model, keyed by the model's display name
model_performance = {}

# Models to evaluate. The randomized estimators share the split's seed so that
# re-running the cell reproduces the same scores.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Train each model on the training split and score it on the held-out split
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE computed as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    model_performance[model_name] = {'MAE': mae, 'RMSE': rmse, 'R²': r2}

# One row per model, one column per metric
performance_df = pd.DataFrame(model_performance).T
print("\nModel Performance Metrics:")
display(performance_df)

# Hyperparameter tuning for Random Forest: exhaustive search over this grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# 5-fold CV on the training split, ranking candidates by (negated) MSE.
# The forest is seeded so the selected parameters are reproducible across runs.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Best estimator found by the search (refit on the whole training split)
best_model = grid_search.best_estimator_
best_model_name = 'Random Forest (Tuned)'

# Evaluate the tuned model on the held-out test split
y_pred_best = best_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)
# sqrt(MSE) instead of `squared=False`, which scikit-learn removed in 1.6
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)

print(f"\nBest Model: {best_model_name}")
print(f"MAE: {mae_best}, RMSE: {rmse_best}, R²: {r2_best}")


# Step 7: Save the Best Model

In [None]:
# Persist the tuned estimator to disk with joblib.
# NOTE(review): only the estimator is written; the MinMaxScaler fitted during
# preprocessing is not saved here — confirm it is persisted elsewhere if the
# model is meant to be applied to new data.
model_filename = 'best_random_forest_model.pkl'
joblib.dump(best_model, model_filename)
print(f"Best model saved as '{model_filename}'.")
