In [224]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [225]:
laptop = pd.read_csv('./data/laptop_price.csv', encoding='latin-1')
df = pd.read_csv('./data/laptops.csv')

In [226]:
# Thanks to the EDA Anlayis, we found a row that is the only one of his kind. This row will just add noise to pur analysis, so we will drop it

laptop.drop(laptop[laptop.laptop_ID == 1209].index, inplace=True)

In [227]:
laptop.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [228]:
laptop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1302 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1302 non-null   int64  
 1   Company           1302 non-null   object 
 2   Product           1302 non-null   object 
 3   TypeName          1302 non-null   object 
 4   Inches            1302 non-null   float64
 5   ScreenResolution  1302 non-null   object 
 6   Cpu               1302 non-null   object 
 7   Ram               1302 non-null   object 
 8   Memory            1302 non-null   object 
 9   Gpu               1302 non-null   object 
 10  OpSys             1302 non-null   object 
 11  Weight            1302 non-null   object 
 12  Price_euros       1302 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 142.4+ KB


In [229]:
laptop.OpSys.value_counts()

Windows 10      1072
No OS             66
Linux             62
Windows 7         45
Chrome OS         26
macOS             13
Mac OS X           8
Windows 10 S       8
Android            2
Name: OpSys, dtype: int64

In [230]:
laptop.Product.value_counts()

XPS 13                                  30
Inspiron 3567                           29
250 G6                                  21
Legion Y520-15IKBN                      19
Vostro 3568                             19
                                        ..
SmartBook 130                            1
A541NA-GO342 (N3350/4GB/500GB/Linux)     1
17-X047na (i3-6006U/8GB/1TB/W10)         1
V310-15ISK (i5-6200U/4GB/1TB/FHD/No      1
15-cd005nv (A9-9420/6GB/256GB/Radeon     1
Name: Product, Length: 617, dtype: int64

In [231]:
laptop.TypeName.value_counts()

Notebook              727
Gaming                205
Ultrabook             196
2 in 1 Convertible    120
Workstation            29
Netbook                25
Name: TypeName, dtype: int64

In [232]:
laptop.Company.value_counts()

Dell         297
Lenovo       297
HP           274
Asus         158
Acer         103
MSI           54
Toshiba       48
Apple         21
Samsung        8
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: Company, dtype: int64

In [233]:
laptop.Ram.value_counts()

8GB     619
4GB     374
16GB    200
6GB      41
12GB     25
2GB      22
32GB     17
24GB      3
64GB      1
Name: Ram, dtype: int64

In [234]:
def reduce_categories(df,columns,number):
    f = df[columns].value_counts().reset_index()
    indexes = f['index'][f[columns]<number]
    
    for index in indexes:
        df[columns]=df[columns].str.replace(index,'Other')

In [235]:
reduce_categories(laptop,'Company',10)

In [236]:
laptop['Ram'] = laptop['Ram'].str[:-2]
laptop['Weight'] = laptop['Weight'].str[:-2]

In [237]:
laptop.OpSys.replace({'Windows 10 S':'Windows 10','Mac OS X':'macOS'},inplace=True)

In [238]:
laptop[['screen_width','screen_height']]=(
    laptop['ScreenResolution']
    .str[-9:]
    .str.split('x',expand=True)
)

In [239]:
def get_dummie(screen):
    if 'IPS Panel' in screen:
        return 1
    else:
        return 0

In [240]:
laptop['IPS Panel'] = laptop['ScreenResolution'].apply(get_dummie)

In [241]:
laptop['CPU Brand'] = laptop.Cpu.str.split(' ').apply(lambda x: x[0])
laptop['CPU GHz'] = (
    laptop.Cpu.str.split(' ').apply(lambda x: x[-1])
).str[:-3]

In [242]:
laptop['Gpu'] = laptop['Gpu'].str.split(' ').apply(lambda x:x[0])

In [243]:
laptop['MemoryAmount'] = laptop['Memory'].str.split(' ').apply(lambda x:x[0])
laptop['MemoryType'] = laptop['Memory'].str.split(' ').apply(lambda x:x[1])

In [244]:
def turn_TB_into_GB(value):
    if 'TB' in value:
        return float(value[:value.find('TB')]) * 1000
    elif 'GB' in value:
        return float(value[:value.find('GB')])

In [245]:
laptop['MemoryAmount'] = laptop['MemoryAmount'].apply(turn_TB_into_GB)

## Changing Data Types

In [246]:
int64 = ['Ram','screen_width','screen_height','MemoryAmount']
float64 = ['Weight','CPU GHz']
def datatypes(df):
    for i in int64:
        df[i] = df[i].astype(int)
    for j in float64:
        df[j] = df[j].astype(float)

In [247]:
datatypes(df=laptop)

In [248]:
laptop.dtypes

laptop_ID             int64
Company              object
Product              object
TypeName             object
Inches              float64
ScreenResolution     object
Cpu                  object
Ram                   int64
Memory               object
Gpu                  object
OpSys                object
Weight              float64
Price_euros         float64
screen_width          int64
screen_height         int64
IPS Panel             int64
CPU Brand            object
CPU GHz             float64
MemoryAmount          int64
MemoryType           object
dtype: object

## Drop columns that we don't gonna use

In [249]:
laptop=laptop.drop(columns={'laptop_ID','Product','ScreenResolution','Cpu','Memory'},axis=1)

### Export csv for an EDA Analysis

In [250]:
laptop.to_csv('./data/laptop-EDA.csv',index=False)

### Get Dummies

In [251]:
laptop = laptop.join(pd.get_dummies(laptop['CPU Brand'],prefix='CPU',drop_first=True))

In [252]:
def dummies(df):
    objects = df.select_dtypes(include='object').columns
    df = pd.get_dummies(df,columns=objects,drop_first=True)
    return df

In [253]:
laptop=dummies(laptop)

In [254]:
laptop.head()

Unnamed: 0,Inches,Ram,Weight,Price_euros,screen_width,screen_height,IPS Panel,CPU GHz,MemoryAmount,CPU_Intel,...,OpSys_Chrome OS,OpSys_Linux,OpSys_No OS,OpSys_Windows 10,OpSys_Windows 7,OpSys_macOS,CPU Brand_Intel,MemoryType_HDD,MemoryType_Hybrid,MemoryType_SSD
0,13.3,8,1.37,1339.69,2560,1600,1,2.3,128,1,...,0,0,0,0,0,1,1,0,0,1
1,13.3,8,1.34,898.94,1440,900,0,1.8,128,1,...,0,0,0,0,0,1,1,0,0,0
2,15.6,8,1.86,575.0,1920,1080,0,2.5,256,1,...,0,0,1,0,0,0,1,0,0,1
3,15.4,16,1.83,2537.45,2880,1800,1,2.7,512,1,...,0,0,0,0,0,1,1,0,0,1
4,13.3,8,1.37,1803.6,2560,1600,1,3.1,256,1,...,0,0,0,0,0,1,1,0,0,1


In [255]:
laptop.to_csv('./data/laptop-to-predict.csv',index=False)