In [1]:
import numpy as np
import pandas as pd
import scipy as sp

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
trainData  = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/train.csv'
testData  = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/mobile/test.csv'

In [3]:
train = pd.read_csv(trainData)
test = pd.read_csv(testData)

In [4]:
print(train.shape, test.shape)

(2000, 21) (1000, 21)


In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [15]:
train.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [7]:
X = train.drop(['price_range'], axis=1)
y = train['price_range']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
pipe = Pipeline([('vect', StandardScaler()),('clf', LogisticRegression(random_state=42))])
grid_params_lr = [{'clf__C': [1, 10, 100], 'clf__max_iter': [1000, 10000]}] 
gs = GridSearchCV(pipe, grid_params_lr, n_jobs=1)
gs.fit(X_train, y_train)
accuracy_score(y_test, gs.predict(X_test))

0.9825

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [11]:
rf = RandomForestClassifier(max_depth=10, random_state=42).fit(X_train, y_train)
accuracy_score(y_test, rf.predict(X_test))

0.8875

In [12]:
gb = GradientBoostingClassifier(max_depth=10, random_state=42).fit(X_train, y_train)
accuracy_score(y_test, gb.predict(X_test))

0.9125

In [13]:
dtc = DecisionTreeClassifier(max_depth=50, random_state=42).fit(X_train, y_train)
accuracy_score(y_test, dtc.predict(X_test))

0.825

In [14]:
test = test.drop(['id'], axis=1)
test_pred = rf.predict(test)