In [131]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [132]:
laptop = pd.read_csv('laptop.csv')
laptop.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [133]:
laptop.drop('Unnamed: 0.1', axis=1, inplace=True)

In [139]:
laptop['Inches'] = pd.to_numeric(laptop['Inches'], errors='coerce')

**Separating category for numerical and strings**

In [195]:
cat_features = [col for col in laptop.columns if laptop[col].dropna().apply(lambda x: isinstance(x, str)).all()]
num_features = [col for col in laptop.columns if laptop[col].apply(lambda x: isinstance(x, (float, int))). all()]

**Filling missing values using scikit-learn library**

In [196]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [197]:
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

In [198]:
filler = ColumnTransformer([('Categorical_Filler', cat_imputer, cat_features),
                            ('Numerical_Filler', num_imputer, num_features)],
                             remainder='passthrough')
filled = filler.fit_transform(laptop)

In [199]:
filled_missing = pd.DataFrame(filled, columns = (cat_features+num_features))

In [200]:
filled_df = filled_missing[laptop.columns]
filled_df.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [203]:
x = filled_df.drop(columns=['Price'])
y = filled_df['Price']

In [204]:
from sklearn.preprocessing import OneHotEncoder

In [206]:
transform = ColumnTransformer([('Categorical_Conversion',
                                OneHotEncoder(),
                                cat_features)],
                              remainder='passthrough')
transformed = transform.fit_transform(x)

In [207]:
transformed_df = pd.DataFrame(transformed.toarray())

In [212]:
transformed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,533,534,535,536,537,538,539,540,541,542
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.3
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,13.3
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,15.6
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,15.4
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,13.3


In [221]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [232]:
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_df, y, test_size=0.2)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)

In [233]:
model.score(X_test, y_test)

0.7521360115901249

**Model Evaluation**
* r2-score method
* Mean absolute Error Method
* Mean Squared Error

In [235]:
y_pred = model.predict(X_test)

**r2-score method**

In [236]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [237]:
r2_score(y_test, y_pred)

0.7521360115901249

**Mean Absolute Error**

In [239]:
mean_absolute_error(y_test, y_pred)

10131.934417438908

In [243]:
df = pd.DataFrame({'Actual Value':y_test, 'Predicted Value':y_pred, 'Difference':abs(y_test-y_pred)})

In [244]:
df['Difference'].mean()

10131.934417438906

**Mean Squared Error**

In [246]:
mean_squared_error(y_test, y_pred)

352863657.093321

In [247]:
df['Squared Difference'] = np.square(df['Difference'])

In [248]:
df.head()

Unnamed: 0,Actual Value,Predicted Value,Difference,Squared Difference
479,89084.16,59788.141344,29296.018656,858256709.092701
1022,61218.72,58407.342192,2811.377808,7903845.179315
298,26586.72,31430.612592,4843.892592,23463295.442833
1265,47898.72,44144.13168,3754.58832,14096933.45268
774,59955.814073,59955.814073,0.0,0.0


In [249]:
df['Squared Difference'].mean()

352863657.0933208