## Input

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [2]:
# Locations of the two CSV files shipped with the dataset.
TRAIN_CSV = '/kaggle/input/mobile-price-classification/train.csv'
TEST_CSV = '/kaggle/input/mobile-price-classification/test.csv'

# Load both files into DataFrames.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

In [3]:
# Preview the first rows of the training data.
df_train.head()

In [4]:
# Preview the first rows of the test data.
df_test.head()

In [5]:
# Summary statistics for the columns of the training data.
df_train.describe()

In [6]:
# Print the column names, non-null counts and dtypes of the training data.
df_train.info()

## Visualization

In [7]:
# Visual check for missing values: the heatmap is drawn from the boolean
# null mask, so any missing cell would show up in a contrasting colour.
null_mask = df_train.isnull()
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')

In [8]:
# Pairwise correlation matrix of the training columns.
df_train.corr()

In [9]:
# Draw the correlation matrix as a heatmap on an enlarged 15x12 figure.
fig = plt.figure(figsize=(15, 12))
corr_matrix = df_train.corr()
sns.heatmap(corr_matrix)

In [10]:
# Distinct values of the target column.
df_train['price_range'].unique()

In [11]:
# Histogram of the battery_power column.
plt.hist(x=df_train['battery_power'])
plt.show()

In [12]:
# Histogram of the ram column.
plt.hist(x=df_train['ram'])
plt.show()

In [13]:
# Histogram of the touch_screen column.
plt.hist(x=df_train['touch_screen'])
plt.show()

In [14]:
# Talk time distribution per price range.
# x and y are passed by keyword: seaborn >= 0.12 made them keyword-only, so the
# positional form raises a TypeError on current versions.
sns.boxplot(data=df_train, x='price_range', y='talk_time')

In [15]:
# Histogram of the dual_sim column.
plt.hist(x=df_train['dual_sim'])
plt.show()

In [16]:
# Histogram of the clock_speed column.
plt.hist(df_train['clock_speed'])
# Render explicitly so the cell does not echo hist()'s return value
# (counts, bin edges, patches) above the figure.
plt.show()

In [17]:
# Distinct values present in the n_cores column.
df_train['n_cores'].unique()

In [18]:
# Histogram of the n_cores column.
plt.hist(df_train['n_cores'])
# Render explicitly so the cell does not echo hist()'s return value
# (counts, bin edges, patches) above the figure.
plt.show()

Phones with 3G support

In [19]:
# Labels and matching counts for the 3G pie chart.
labels = ["3G-supported",'Not supported']
# value_counts() orders its result by frequency, not by value, so taking
# .values directly would pair the labels with whichever class happens to be
# more common. Select the counts explicitly in the same order as `labels`.
# NOTE(review): assumes three_g is encoded 1 = supported, 0 = not supported — confirm.
three_g_counts = df_train['three_g'].value_counts()
values = three_g_counts.loc[[1, 0]].values

In [20]:
# Pie chart of the current `labels` / `values` pair, with percentage annotations.
colors = ['green', 'red']
fig1, ax1 = plt.subplots()
ax1.pie(
    values,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.show()

Phones with 4G support

In [21]:
# Pie chart of the share of phones with and without 4G.
labels = ["4G-supported",'Not supported']
# value_counts() orders its result by frequency, not by value, so taking
# .values directly would pair the labels with whichever class happens to be
# more common. Select the counts explicitly in the same order as `labels`.
# NOTE(review): assumes four_g is encoded 1 = supported, 0 = not supported — confirm.
four_g_counts = df_train['four_g'].value_counts()
values = four_g_counts.loc[[1, 0]].values
fig1, ax1 = plt.subplots()
colors = ['gold', 'lightskyblue']
ax1.pie(values, labels=labels, autopct='%1.1f%%',shadow=True,startangle=90,colors=colors)
plt.show()

In [22]:
# Overlay the histograms of the two camera columns on one figure.
plt.figure(figsize=(10, 6))
camera_series = (
    ('fc', 'blue', 'Front camera'),
    ('pc', 'red', 'Primary camera'),
)
for column, colour, caption in camera_series:
    # alpha=0.5 keeps both distributions visible where they overlap
    df_train[column].hist(alpha=0.5, color=colour, label=caption)
plt.legend()
plt.xlabel('MegaPixels')
In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Separate the features from the target column.
x = df_train.drop('price_range',axis=1)
y = df_train['price_range']

# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean/variance; fitting the scaler on all rows leaks test information into
# training. random_state pins the shuffle so downstream scores are reproducible.
x_train_raw,x_test_raw,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler()
scaler.fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Kept so any cell that still reads x_transformed keeps working:
# all rows, scaled with the train-fitted scaler.
x_transformed = scaler.transform(x)

## Linear Regression

In [24]:
# Linear Regression: a supervised model that fits the best linear relationship
# between the input features and a numeric target.
# NOTE(review): the target used in this notebook is price_range, which the later
# sections treat as a class label — confirm a regression baseline is intended.
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [25]:
# Fit the linear regression on the training split.
lm.fit(x_train,y_train)

In [26]:
# R^2 on the training split and on the held-out split, in that order.
# The training score alone is optimistic because the model was fit on those
# rows; the other models in this notebook are all scored on x_test / y_test.
lm.score(x_train,y_train), lm.score(x_test,y_test)

In [27]:
# Display the estimator's repr.
lm

## Logistic Regression

In [28]:
# Logistic Regression: a linear classifier that predicts a categorical target
# from a set of input features.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report


lr = LogisticRegression()
lr.fit(x_train,y_train)
y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the confusion matrix comes out
# transposed and the report's precision and recall columns are exchanged.
print("Train Set Accuracy:"+str(accuracy_score(y_train,y_train_pred)*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,y_test_pred)*100))
# Confusion matrix: rows are the actual classes, columns the predicted classes.
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test,y_test_pred))
print("\nClassification Report:\n%s"%classification_report(y_test,y_test_pred))

# model.score(x_train,y_train)


## KNN

In [29]:
# KNN: "K-Nearest Neighbours", a supervised algorithm that can be used for both
# classification and regression. Here it is used as a classifier with 15 neighbours.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(x_train,y_train)


In [30]:
# Score of the KNN classifier on the held-out split.
knn.score(x_test,y_test)

In [31]:
# Keep the KNN predictions for the held-out split; a later cell builds a confusion matrix from them.
pred = knn.predict(x_test)

In [32]:
# Misclassification rate on the held-out split for k = 1..19.
# The candidate models are bound to a separate name (knn_k): reusing `knn`
# would silently replace the 15-neighbour model with the last candidate (k=19)
# for every later cell that reads `knn`.
error_rate = []
for i in range(1,20):
    knn_k = KNeighborsClassifier(n_neighbors=i)
    knn_k.fit(x_train,y_train)
    pred_i = knn_k.predict(x_test)
    # the mean of the boolean mismatch mask is the fraction of wrong predictions
    error_rate.append(np.mean(pred_i != y_test))

In [33]:
# Plot the collected error rates against the K values 1..19 they were measured at.
k_values = range(1, 20)
plt.figure(figsize=(10, 6))
plt.plot(
    k_values,
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=5,
)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [34]:
# Confusion matrix for the stored predictions in `pred`:
# rows are the actual classes, columns the predicted classes.
matrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(matrix)

## Decision Tree

In [35]:
# Decision Tree: a classifier that learns a hierarchy of feature-threshold rules
# and follows them from the root to a leaf to assign a class.
from sklearn.tree import DecisionTreeClassifier
# random_state pins the tree's internal randomness so the fitted tree and its
# score are the same on every run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(x_train,y_train)

In [36]:
# Score of the decision tree on the held-out split.
dtree.score(x_test,y_test)

## Random Forest

In [37]:
# Random Forest: builds decision trees on different bootstrap samples and takes
# their majority vote for classification (their average for regression).
from sklearn.ensemble import RandomForestClassifier
# random_state pins the bootstrap sampling and feature selection so the forest
# and its score are the same on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(x_train, y_train)

In [38]:
# Score of the random forest on the held-out split.
rfc.score(x_test,y_test)

## SVM

In [39]:
# SVM: support vector machines are a family of supervised learning methods used
# for classification, regression and outlier detection. SVC is the classifier.
from sklearn.svm import SVC
modelsvm = SVC()
modelsvm.fit(x_train,y_train)
y_train_pred = modelsvm.predict(x_train)
y_test_pred = modelsvm.predict(x_test)


# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the confusion matrix comes out
# transposed and the report's precision and recall columns are exchanged.
print("Train Set Accuracy:"+str(accuracy_score(y_train,y_train_pred)*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,y_test_pred)*100))
# Confusion matrix: rows are the actual classes, columns the predicted classes.
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test,y_test_pred))
print("\nClassificationReport:\n%s"%classification_report(y_test,y_test_pred))

## Gradient Boosting

In [40]:
# Gradient Boosting: builds an ensemble of trees sequentially, each new tree
# fitted to correct the errors of the ensemble so far, which mainly reduces bias.
from sklearn.ensemble import GradientBoostingClassifier
# random_state pins the estimator's internal randomness so the scores below
# are the same on every run.
model = GradientBoostingClassifier(random_state=42)
model.fit(x_train,y_train)
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)


# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the confusion matrix comes out
# transposed and the report's precision and recall columns are exchanged.
print("Train Set Accuracy:"+str(accuracy_score(y_train,y_train_pred)*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,y_test_pred)*100))
# Confusion matrix: rows are the actual classes, columns the predicted classes.
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test,y_test_pred))
print("\nClassificationReport:\n%s"%classification_report(y_test,y_test_pred))

In [41]:
# from sklearn.model_selection import cross_val_score
# from xgboost import XGBClassifier
# xgb = XGBClassifier(random_state =1)
# cv = cross_val_score(xgb,x_train,y_train,cv=5)
# print(cv)
# print(cv.mean())

In [42]:
# x_train = x_train.astype(int)
# y_train = y_train.astype(int)
# x_test = x_test.astype(int)
# y_test = y_test.astype(int)

In [43]:
from sklearn.ensemble import VotingClassifier
# Hard-voting ensemble over five of the classifiers built above:
# each member predicts a class and the majority label wins.
base_estimators = [
    ('rfc', rfc),
    ('knn', knn),
    ('modelsvm', modelsvm),
    ('lr', lr),
    ('dtree', dtree),
]
price_classfication = VotingClassifier(
    estimators=base_estimators,
    voting="hard",
    n_jobs=-1,
)
price_classfication.fit(x_train, y_train)
ensemble_accuracy = price_classfication.score(x_test, y_test)
print("Accuracy: ", ensemble_accuracy)