In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from random import random
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import mean_absolute_error
import matplotlib
import matplotlib.pyplot as plt

In [10]:
# Load data
# Read the raw training set once and keep it untouched; all cleaning below is
# done on a separate working copy.
df_laptops_original = pd.read_csv(
    r"../../data/train.csv",
    sep=',',
)
df_laptops = df_laptops_original.copy(deep=True)

### Cleaning data and handling missing values

In [11]:
# Normalise the capitalised screen_surface labels to lower case
# (e.g. 'Glossy' --> 'glossy').
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on a selected column is a chained in-place update,
# which pandas does not guarantee to write through to df_laptops (and never
# does when Copy-on-Write is enabled).
df_laptops['screen_surface'] = df_laptops['screen_surface'].replace(
    {'Glossy': 'glossy', 'Matte': 'matte'}
)

In [12]:
# Detect missing values
# Fill every missing entry with NaN, then look at where the gaps are.
df_laptops.fillna(np.nan, inplace=True)
missing_mask = df_laptops.isna()
null_data = df_laptops.loc[missing_mask.any(axis=1)]  # rows with at least one gap
# Columns with at least one gap; for this data set:
# ['screen_surface', 'cpu_details', 'detachable_keyboard', 'gpu', 'os', 'os_details', 'weight']
df_laptops.columns[missing_mask.any()]

Index(['screen_surface', 'cpu_details', 'detachable_keyboard', 'gpu', 'os',
       'os_details', 'weight'],
      dtype='object')

In [13]:
# Replace NaN screen_surface values with 'glossy' or 'matte' at random
# (each with probability 0.5).
# A fixed seed makes the fill, and therefore the exported data set, identical
# on every Restart & Run All.
screen_surface_missing = df_laptops['screen_surface'].isna()
if screen_surface_missing.any():
    rng = np.random.default_rng(42)
    # An object-dtype choice array keeps the drawn labels as plain Python
    # strings, like the other values in the column.
    surface_labels = np.array(['glossy', 'matte'], dtype=object)
    df_laptops.loc[screen_surface_missing, 'screen_surface'] = rng.choice(
        surface_labels, size=int(screen_surface_missing.sum())
    )

In [14]:
# Fill the gaps in weight with the median of the known weights
median_weight = df_laptops['weight'].median()
df_laptops['weight'] = df_laptops['weight'].fillna(median_weight)

In [15]:
# Replace missing OS and OS_details based on brand:
# Apple laptops get macOS / macOS Catalina, every other brand Windows / Windows 10.
# Both columns are written for every row whose os is missing.
os_missing = df_laptops['os'].isna()
# Case-insensitive brand check; a non-string brand (e.g. NaN) counts as "not Apple".
is_apple = df_laptops['brand'].map(
    lambda brand: isinstance(brand, str) and 'apple' in brand.lower()
).astype(bool)

apple_rows = os_missing & is_apple
other_rows = os_missing & ~is_apple

df_laptops.loc[apple_rows, 'os'] = 'macOS'
df_laptops.loc[apple_rows, 'os_details'] = 'macOS Catalina'
# os holds the OS family and os_details the specific version, mirroring the
# macOS pair above (these two values used to be written to the wrong columns).
df_laptops.loc[other_rows, 'os'] = 'Windows'
df_laptops.loc[other_rows, 'os_details'] = 'Windows 10'

In [16]:
# Row positions (0 .. n-1) and column names of df_laptops, kept as arrays so
# they can be used to build a new DataFrame later on.
index_array = np.array(range(len(df_laptops)))
column_names_array = df_laptops.columns.to_numpy(dtype=object)

In [17]:
# Replace the rest of the missing values with each column's most frequent value.
imp = SimpleImputer(strategy="most_frequent")
temp_array = imp.fit_transform(df_laptops)

# fit_transform returns one array for the whole frame, so a DataFrame built
# from it would keep every column as generic object dtype; infer_objects()
# gives columns a proper numeric/bool dtype again wherever their values allow it.
no_nulls_df = pd.DataFrame(
    data=temp_array, index=index_array, columns=column_names_array
).infer_objects()

In [18]:
# Last missing values check: after imputation no row may still contain a gap.
# Fail loudly here rather than silently exporting incomplete rows.
null_data = no_nulls_df[no_nulls_df.isnull().any(axis=1)]
assert null_data.empty, f"{len(null_data)} rows still contain missing values"

In [25]:
# Remove unimportant columns
redundant_columns = [
    'id',
    'name',          # its information is found in other columns
    'base_name',     # partially contained in brand
    'os',            # os_details is the more important column
    'discrete_gpu',  # contained in the gpu information
    'cpu_details',   # the cpu column alone is enough
]
# Copy first so no_nulls_df itself is never affected by later edits.
most_important_features_df = no_nulls_df.copy().drop(columns=redundant_columns)

In [20]:
# One-hot encoding
# Work on a copy: most_important_features_df stays untouched, so re-running
# this cell always starts from the 'glossy'/'matte' labels and gives the same
# result (encoding already-encoded values would turn every row into 0).
temp_df = most_important_features_df.copy()

# Encode glossy and matte as 1 and 0 respectively
# (any value other than 'glossy' becomes 0).
temp_df['screen_surface'] = temp_df['screen_surface'].eq('glossy').astype(int)

# One indicator column per distinct value of each categorical feature.
brand_df = pd.get_dummies(temp_df['brand'], prefix='brand')
cpu_df = pd.get_dummies(temp_df['cpu'], prefix='cpu')
gpu_df = pd.get_dummies(temp_df['gpu'], prefix='gpu')
os_df = pd.get_dummies(temp_df['os_details'], prefix='os_details')

In [21]:
# Place the indicator columns side by side, in front of the remaining features.
encoded_parts = [brand_df, cpu_df, gpu_df, os_df, temp_df]
final_df = pd.concat(encoded_parts, axis='columns')

In [24]:
# Drop the source columns that are now represented by their indicator columns
final_df = final_df.drop(columns=['brand', 'cpu', 'gpu', 'os_details'])

# Write the final frame to CSV, without the row index
final_df.to_csv(r'../../data/preprocessed_data.csv', index=False)