In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn import metrics
from scipy import stats

# Dataset exploration

In [None]:
laptop_ds = pd.read_csv('../input/laptop-price/laptop_price.csv', encoding = 'ISO-8859-1')

In [None]:
laptop_ds.head()

In [None]:
laptop_ds.index

In [None]:
laptop_ds.info()

In [None]:
laptop_ds.describe().T

In [None]:
laptop_ds.isna().sum()

In [None]:
sns.heatmap(laptop_ds.isna(), cmap = "viridis", cbar = False, yticklabels = False, xticklabels = True)
plt.text(6, 0, "missing data with yellow", ha = "center" , va = "bottom")

In [None]:
round(laptop_ds['Company'].value_counts() / laptop_ds.shape[0] * 100, 2)

# label Encoding 

In [None]:
laptop_ds.columns

In [None]:
laptop_coded = laptop_ds.copy(deep = True)

In [None]:
# Label-encode only the truly categorical columns. One fitted encoder per
# column is kept in `les` so the integer codes can be mapped back to labels.
les = {}
for col in ['Company', 'Product', 'TypeName', 'ScreenResolution',
            'Cpu', 'Memory', 'Gpu', 'OpSys'] :
    les[col] = LabelEncoder()
    laptop_coded[col] = les[col].fit_transform(laptop_coded[col])

# 'Ram' and 'Weight' are quantities carrying a unit suffix ('GB' / 'kg'), not
# categories. LabelEncoder would rank their strings lexicographically
# ("16GB" sorts before "2GB"), which distorts every correlation computed from
# them, so the unit is stripped and the number is kept instead.
# astype(str) keeps the cell re-runnable if the columns are already numeric.
laptop_coded['Ram'] = laptop_coded['Ram'].astype(str).str.replace('GB', '', regex = False).astype(int)
laptop_coded['Weight'] = laptop_coded['Weight'].astype(str).str.replace('kg', '', regex = False).astype(float)
    

In [None]:
plt.figure(figsize = (20, 15))
sns.heatmap(laptop_coded.corr(), annot = True, cmap = "viridis")
plt.title("coorelations between our features")

In [None]:
print("                Good coorelations")
# Compute the correlation matrix once; the original recomputed it up to three
# times for every pair of columns.
corr = laptop_coded.corr()
for i in range(len(corr.columns)) :
    # j < i walks the lower triangle only, so each pair is reported once.
    for j in range(i) :
        value = corr.iloc[i, j]
        # abs() must wrap the coefficient, not the comparison: the original
        # `abs(value > 0.4)` silently ignored strong negative correlations.
        if value != 1 and abs(value) > 0.4 :
            print("(" + corr.columns[i] + ") and (" + corr.columns[j] + ")", end = "")
            print("     with correlation  " ,  value)

# Data visualization

In [None]:
laptop_ds['Gpu'].value_counts()

In [None]:
laptop_ds.columns

# Bivariate charts

In [None]:
bi_vars_with_price = ['Company', 'Ram', 'ScreenResolution', 'OpSys','TypeName', 'Inches', 'Memory']

In [None]:
def plot_cat_vars (df, var) :
    """Draw a bar chart of `Price_euros` against the values of one column.

    Parameters
    ----------
    df : DataFrame containing `var` and a 'Price_euros' column.
    var : name of the column placed on the x axis.

    The bar heights come from seaborn's `barplot`, which aggregates
    'Price_euros' per x value with its default estimator.
    The figure is shown immediately; nothing is returned.
    """
    plt.figure(figsize = (15, 7))
    # A space was missing before "and", gluing the column name to it.
    plt.title("distribution of " + var + " and Price_euros")
    sns.barplot(x = var, y = 'Price_euros', data = df)
    # Category labels can be long, so they are rotated to avoid overlap.
    plt.xticks(rotation = 'vertical')
    plt.ylabel('Price_in_euro')
    plt.show()

In [None]:
for var in bi_vars_with_price :
    plot_cat_vars(laptop_ds, var)

In [None]:
laptop_ds['Inches'].value_counts()

# early conclusions

### 1-The highest-priced laptops fall into the following categories
###        1- Workstation laptops
###        2- high memory capacity (1 TB SSD and 1 TB HDD)
###        3- large screens (18 inches)
###        4- 64 GB RAM
###        5- macOS operating system
###        6- 4K Ultra HD / touchscreen (3840 x 2160) resolution
###        7- Razer company
### __________________________________________________________________
### 2-The moderately priced laptops fall into the following categories
###        1- 2-in-1 convertible laptops
###        2- moderate memory capacity (512 GB SSD and 256 GB SSD)
###        3- moderate screen size (14 inches)
###        4- 24 GB RAM
###        5- Windows 10 operating system
###        6- touchscreen (2400 x 1600) resolution
###        7- Microsoft and Huawei companies
###___________________________________________________________________
### 3-The lowest-priced laptops fall into the following categories
###        1- Netbook laptops
###        2- low memory capacity (32 GB flash memory)
###        3- moderate screen size (14.1 inches)
###        4- 2 GB RAM
###        5- Android operating system
###        6- 4K Ultra HD / touchscreen (1920 x 1080) resolution
###        7- Vero company

# Multivariate charts

In [None]:
laptop_ds['Price_euros'].describe()

In [None]:
def categorize_the_prices (df) :
    """Return the price-band label for one record.

    `df` is anything indexable by 'Price_euros' (a DataFrame row when used
    with `DataFrame.apply(..., axis = 1)`).

    Bands (lower bound inclusive, upper bound exclusive):
        below 500      -> 'Low category'
        500  to 1500   -> 'moderate category'
        1500 to 4000   -> 'high category'
        4000 and above -> 'very high category'

    Returns None when no comparison holds (for example a NaN price).
    """
    price = df['Price_euros']
    if price < 500 :
        return 'Low category'
    if 500 <= price < 1500 :
        return 'moderate category'
    if 1500 <= price < 4000 :
        return 'high category'
    if price >= 4000 :
        return 'very high category'
    return None

In [None]:
laptop_ds['Price_cat'] = laptop_ds.apply(categorize_the_prices , axis = 1)

In [None]:
def cluster_charts (df, var1, var2) :
    """Show a grouped bar chart of row counts for every (var1, var2) pair.

    Rows of `df` are counted per combination of the two columns; `var1`
    becomes the x axis and each value of `var2` gets its own bar within a
    group. Combinations that never occur are plotted as zero.
    """
    pair_counts = df.groupby([var1, var2]).size()
    # One row per var1 value, one column per var2 value.
    count_table = pair_counts.unstack(fill_value = 0)
    count_table.plot.bar(figsize = (20,10), title = (var1 + " vs " + var2))
    plt.ylabel("Freq")
    plt.show()

In [None]:
multi_var = ['Company', 'Ram', 'ScreenResolution', 'OpSys','TypeName', 'Inches', 'Memory']

In [None]:
for var1 in multi_var :
    cluster_charts(laptop_ds, var1, 'Price_cat')

# Conclusions from the multivariate charts

### 1- Most laptops in the very high price category are produced by Lenovo, HP and Razer
### 2- Most laptops in the very high price category are of type Gaming and Notebook
### 3- Moderate category laptops are the most common
### 4- Within every category, the memory, screen resolution, RAM and screen size are compatible with that category

In [None]:
# Strip the unit suffixes and store 'Weight' and 'Ram' as numbers.
# astype(str) first makes the cell safe to re-run: once the columns are
# numeric the `.str` accessor would otherwise raise. regex = False makes the
# replacement a literal one regardless of the pandas version's default.
laptop_ds['Weight'] = laptop_ds['Weight'].astype(str).str.replace('kg', '', regex = False).astype(float)
laptop_ds['Ram'] = laptop_ds['Ram'].astype(str).str.replace('GB', '', regex = False).astype(int)

In [None]:
def plot_scatter(df, var1, var2) :
    """Show a green scatter plot of column `var2` (y) against `var1` (x).

    Both names must be columns of `df`; they are also used as the axis
    labels and in the figure title.
    """
    x_values = df[var1]
    y_values = df[var2]
    plt.figure(figsize = (15,7))
    plt.scatter(x = x_values, y = y_values, c = "green")
    plt.title("scatter " + var1 + " vs " + var2)
    plt.xlabel(var1)
    plt.ylabel(var2)
    plt.show()

In [None]:
scatter_vars = ['Inches','Ram', 'Weight']

In [None]:
for var in scatter_vars :
    plot_scatter(laptop_ds, var, 'Price_euros')

# As the weight increases, the price also tends to increase

In [None]:
companies = ['Dell', 
'Lenovo',
'HP',
'Asus',
'Acer',
'MSI',
'Toshiba',
'Apple',
'Samsung',
'Razer',
'Mediacom',
'Microsoft',
'Vero',
'Xiaomi',         
'LG',
'Chuwi',
'Google',
'Fujitsu',
'Huawei']

In [None]:
len(laptop_ds.Price_cat.unique())

In [None]:
def pie_chart(df, cat) :
    """Draw a pie chart of one company's laptops split by price category.

    Parameters
    ----------
    df : DataFrame with 'Company' and 'Price_cat' columns.
    cat : company name to select (compared with `df.Company`).

    Prints a short notice and returns without plotting when `df` holds no
    rows for `cat`; the original crashed inside the plotting call instead.
    """
    company_rows = df[df.Company == cat]
    if company_rows.empty :
        print("no laptops found for company : " + cat)
        return

    # Rows: price categories present for this company; single column: the company.
    counts = (company_rows
              .groupby(['Price_cat', 'Company'])
              .size()
              .unstack(fill_value = 0))

    pie_kwargs = dict(
        colors = ['green', 'indigo', 'blue', 'red'],
        autopct = "%1.2f%%",
        shadow = True,
        subplots = True,
        figsize = (10,10),
        title = (cat + " production of laptops"),
    )
    # Wedges are pulled apart only when there is more than one; the offsets
    # tuple is sized from the data rather than taken from a fixed lookup table.
    if len(counts) > 1 :
        pie_kwargs['explode'] = (0.025,) * len(counts)
    counts.plot.pie(**pie_kwargs)

In [None]:
for company in companies :
    pie_chart(laptop_ds, company)

In [None]:
sns.distplot(laptop_ds.Price_euros[(laptop_ds.Price_cat == "moderate category")])

In [None]:
sns.distplot(laptop_ds.Price_euros[(laptop_ds.Price_cat == "high category")])

In [None]:
sns.distplot(laptop_ds.Price_euros[(laptop_ds.Price_cat == "very high category")])

In [None]:
laptop_ds['Price_cat'].value_counts()

In [None]:
sns.distplot(laptop_ds.Price_euros[(laptop_ds.Price_cat == "Low category")])

# modeling 

In [None]:
# The row identifier carries no predictive information, so it is removed
# before modeling. errors = 'ignore' keeps the cell re-runnable: the original
# in-place drop raised KeyError on a second execution.
laptop_ds = laptop_ds.drop(columns = 'laptop_ID', errors = 'ignore')

In [None]:
laptop_model = laptop_ds.copy(deep = True)

In [None]:
# Fit one LabelEncoder per categorical column of the modeling frame and
# overwrite the column with its integer codes. The fitted encoders stay in
# `les` (keyed by column name) so new rows can be encoded the same way later.
categorical_cols = ['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Memory', 'Gpu', 'OpSys']
les = {col : LabelEncoder() for col in categorical_cols}
for col, encoder in les.items() :
    laptop_model[col] = encoder.fit_transform(laptop_model[col])

In [None]:
# Features are every column except the target ('Price_euros') and the
# price-band label derived from it ('Price_cat'); 25% of the rows are held out.
features = laptop_model.drop(['Price_euros', 'Price_cat'], axis = 1).values
target = laptop_model['Price_euros'].values
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size = 0.25,
    random_state = 42,
)

# XGboost regressor

In [None]:
# Baseline gradient-boosted regressor with hand-picked settings.
xgb = XGBRegressor(
    # "reg:linear" is the deprecated name of the squared-error objective.
    objective = "reg:squarederror",
    seed = 200,
    n_estimators = 200,
    learning_rate = 0.1,
    gamma = 0.1,
    subsample = 0.8,
    colsample_bytree = 0.8,
    reg_alpha = 1,
    reg_lambda = 1,
    # `silent` was replaced by `verbosity`; 1 (warnings) matches silent = False.
    verbosity = 1
)

In [None]:
xgb.fit(x_train, y_train)

In [None]:
predicted = xgb.predict(x_test)

In [None]:
print("The mean square error is : ", metrics.mean_squared_error(y_test, predicted))

In [None]:
print("accuracy is : ", xgb.score(x_test, y_test))
print("pearson's correlation is : ", stats.pearsonr(y_test, predicted)[0])

# Grid search

In [None]:
# Only genuine hyper-parameters are searched. The random seed was removed from
# the grid: it is not a tunable property of the model, so searching over it
# merely selects the luckiest draw on the CV folds and multiplies the number
# of fits by four. The seed set on the estimator keeps the search reproducible.
params = {
    'n_estimators' : [100 , 150 , 200, 250, 300],
    'learning_rate' : [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
}

In [None]:
grid = GridSearchCV(estimator = xgb, param_grid = params, n_jobs = -1, cv =3 , verbose = 1)

In [None]:
xgb_grid = grid.fit(x_train, y_train)

In [None]:
print("best_score : " + str(xgb_grid.best_score_) + " with best params : " + str(xgb_grid.best_params_))

In [None]:
xgb_grid_model = xgb_grid.best_estimator_
predicted_xgb_grid = xgb_grid_model.predict(x_test)
print("accuracy is : ", xgb_grid_model.score(x_test, y_test))
print("pearson's correlation is : ", stats.pearsonr(y_test, predicted_xgb_grid)[0])
print("The mean square error is : ", metrics.mean_squared_error(y_test, predicted_xgb_grid))

# Linear regression

In [None]:
lr = LinearRegression()
lr.fit(x_train, y_train)

In [None]:
predicted_lr = lr.predict(x_test)

In [None]:
print(metrics.mean_squared_error(y_test, predicted_lr))

In [None]:
print("accuracy is : ", lr.score(x_test, y_test))
print("pearson's correlation is : ", stats.pearsonr(y_test, predicted_lr)[0])

# Support vector machine regressor

In [None]:
svm = SVR(kernel = 'rbf', C = 65000 ,epsilon = 25)
svm.fit(x_train, y_train)
predicted_svm = svm.predict(x_test)

In [None]:
metrics.mean_squared_error(y_test, predicted_svm)

In [None]:
print("accuracy is : ", svm.score(x_test, y_test))
print("pearson's correlation is : ", stats.pearsonr(y_test, predicted_svm)[0])

In [None]:
laptop_model.head()

In [None]:
laptop_ds.head()

In [None]:
laptop_ds.columns

# Voting Regressor

In [None]:
# Combine the tuned XGBoost model, the linear regression and the SVR into one
# ensemble, refit it on the training split and evaluate it on the held-out rows.
vot = VotingRegressor([('xgbGrid', xgb_grid_model), ('lr', lr), ('svr', svm)])
vot.fit(x_train, y_train)
predicted_vot = vot.predict(x_test)
print("accuracy is : ", vot.score(x_test, y_test))
print("pearson's correlation is : ", stats.pearsonr(y_test, predicted_vot)[0])
# The original passed the function object itself to print(); it has to be
# called on the true and predicted prices to report the actual error.
print("mean square error is : ", metrics.mean_squared_error(y_test, predicted_vot))

# Testing

In [None]:
test = ["Apple", "MacBook Pro", "Ultrabook", 13.3, "IPS Panel Retina Display 2560x1600", "Intel Core i5 2.3GHz", 8, "128GB SSD", "Intel Iris Plus Graphics 640", "macOS", 1.37]

In [None]:
# Replace every categorical entry of `test` with the integer code produced by
# the encoder fitted for that column. Keys are positions inside `test`;
# positions 3, 6 and 10 are skipped and left untouched.
encoded_positions = {
    0 : 'Company',
    1 : 'Product',
    2 : 'TypeName',
    4 : 'ScreenResolution',
    5 : 'Cpu',
    7 : 'Memory',
    8 : 'Gpu',
    9 : 'OpSys',
}
for position, column in encoded_positions.items() :
    test[position] = les[column].transform([test[position]])[0]

In [None]:
test

In [None]:
print(xgb.predict(np.array([test])))

# Thank you for your time :)

# salaaam :)