In [1]:
# Import required python libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load data from csv file
data = pd.read_csv("company_data.csv", delimiter=",")
data.shape

(7051, 32)

In [3]:
# shows 1st 5 rows
data.head()

Unnamed: 0.1,Unnamed: 0,brand,phone,Status,release_year,release_quarter,headphone_jacks,composer,MP3,monophonic,...,Push_Email,Thread,Radio,Resolution,Screen_size,talk_time,weight_in_g,Keyboard,Primary,Secondary
0,1,Acer,Iconia A1-830,Available,2014.0,Q1,Yes,No,Yes,No,...,Yes,No,No,212,7.9,450.0,380.0,,5.0,2.0
1,2,Acer,Iconia B1-720,Available,2014.0,Q1,Yes,No,Yes,No,...,Yes,No,No,188,7.0,300.0,315.0,,0.3,0.0
2,3,Acer,Iconia B1-721,Available,2014.0,Q1,Yes,No,Yes,No,...,Yes,No,No,188,7.0,0.0,323.0,,0.3,0.0
3,4,Acer,Iconia One 7 B1-730,Available,2014.0,Q2,Yes,No,Yes,No,...,Yes,No,No,770,7.0,420.0,0.0,,5.0,0.0
4,5,Acer,Iconia Tab 7 A1-713,Available,2014.0,Q3,Yes,No,Yes,No,...,Yes,Yes,No,188,7.0,360.0,298.0,,2.0,0.0


In [4]:
# days stores the no.of days the product has been in the inventory
days = np.random.random_integers(20, high=150, size=(data.shape[0],))
days

  """Entry point for launching an IPython kernel.


array([101,  82, 127, ...,  69,  70, 111])

In [5]:
# Filling null values with the mean values of the column
data.iloc[:,-7].fillna(data.iloc[:,-7].mean(), inplace=True)

In [6]:
data.iloc[:,-6].fillna(data.iloc[:,-6].mean(), inplace=True)
data.iloc[:,-5].fillna(data.iloc[:,-5].mean(), inplace=True)
data.iloc[:,-4].fillna(data.iloc[:,-4].mean(), inplace=True)
data.iloc[:,-2].fillna(data.iloc[:,-2].mean(), inplace=True)
data.iloc[:,-1].fillna(data.iloc[:,-1].mean(), inplace=True)

In [7]:
# Filling null values of string columns with the mode of the column
data = data.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [8]:
# adding the target column to the end
data['status'] = data['Status']
data = data.drop(['Status'],axis=1)
data.shape

(7051, 32)

In [9]:
# Dropping less significant columns
data.drop('Keyboard', axis=1, inplace=True)
data.drop('phone', axis=1, inplace=True)
data.drop('release_year', axis=1, inplace=True)
data.drop('release_quarter', axis=1, inplace=True)

In [10]:
# Label encoding of categorical data to numerical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in ([1,11,-1]):
    array = le.fit_transform(np.array(data.iloc[:,i]))
    data.iloc[:,i] = array
data.head()

Unnamed: 0.1,Unnamed: 0,brand,headphone_jacks,composer,MP3,monophonic,polyphonic,wav,vibration,acc,...,Push_Email,Thread,Radio,Resolution,Screen_size,talk_time,weight_in_g,Primary,Secondary,status
0,1,1,Yes,No,Yes,No,No,Yes,Yes,No,...,Yes,No,No,212,7.9,450.0,380.0,5.0,2.0,0
1,2,1,Yes,No,Yes,No,No,Yes,Yes,No,...,Yes,No,No,188,7.0,300.0,315.0,0.3,0.0,0
2,3,1,Yes,No,Yes,No,No,Yes,Yes,No,...,Yes,No,No,188,7.0,0.0,323.0,0.3,0.0,0
3,4,1,Yes,No,Yes,No,No,Yes,Yes,No,...,Yes,No,No,770,7.0,420.0,0.0,5.0,0.0,0
4,5,1,Yes,No,Yes,No,No,Yes,Yes,No,...,Yes,Yes,No,188,7.0,360.0,298.0,2.0,0.0,0


In [11]:
# Mapping 'Yes' category as 1 and 'No' category as 0
dict_= {'Yes': 1, 'No': 0}
for i in range(2, 10):
    data.iloc[:, i] = data.iloc[:, i].map(dict_)
for i in range(12, 21):
    data.iloc[:, i] = data.iloc[:, i].map(dict_)
data.head()

Unnamed: 0.1,Unnamed: 0,brand,headphone_jacks,composer,MP3,monophonic,polyphonic,wav,vibration,acc,...,Push_Email,Thread,Radio,Resolution,Screen_size,talk_time,weight_in_g,Primary,Secondary,status
0,1,1,1,0,1,0,0,1,1,0,...,1,0,0,212,7.9,450.0,380.0,5.0,2.0,0
1,2,1,1,0,1,0,0,1,1,0,...,1,0,0,188,7.0,300.0,315.0,0.3,0.0,0
2,3,1,1,0,1,0,0,1,1,0,...,1,0,0,188,7.0,0.0,323.0,0.3,0.0,0
3,4,1,1,0,1,0,0,1,1,0,...,1,0,0,770,7.0,420.0,0.0,5.0,0.0,0
4,5,1,1,0,1,0,0,1,1,0,...,1,1,0,188,7.0,360.0,298.0,2.0,0.0,0


In [12]:
# Mapping 'Yes' category as 1 and 'No' category as 0
data.bluetooth = data.bluetooth.map({'yes': 1, 'no': 0})

In [13]:
# Adding no.of days column to the data
data['no_of_days'] = days
data.head()

Unnamed: 0.1,Unnamed: 0,brand,headphone_jacks,composer,MP3,monophonic,polyphonic,wav,vibration,acc,...,Thread,Radio,Resolution,Screen_size,talk_time,weight_in_g,Primary,Secondary,status,no_of_days
0,1,1,1,0,1,0,0,1,1,0,...,0,0,212,7.9,450.0,380.0,5.0,2.0,0,101
1,2,1,1,0,1,0,0,1,1,0,...,0,0,188,7.0,300.0,315.0,0.3,0.0,0,82
2,3,1,1,0,1,0,0,1,1,0,...,0,0,188,7.0,0.0,323.0,0.3,0.0,0,127
3,4,1,1,0,1,0,0,1,1,0,...,0,0,770,7.0,420.0,0.0,5.0,0.0,0,45
4,5,1,1,0,1,0,0,1,1,0,...,1,0,188,7.0,360.0,298.0,2.0,0.0,0,129


In [14]:
# Testing data contains aged products whereas training data contains the rest
train_data = data[data.no_of_days < 90]
test_data = data[data.no_of_days >= 90]

In [15]:
# creating training and testing data to be trained and predicted respectively
X_train = train_data.iloc[:,:-2]
Y_train = train_data.iloc[:,-2]
X_test = test_data.iloc[:,:-2]
Y_test = test_data.iloc[:,-2]

In [22]:
X_train.shape

(3830, 27)

In [16]:
X_test.shape

(3221, 27)

In [17]:
# Normalizing the data to prevent overfitting
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
X_train
X_test

array([[-1.7451061 , -1.89762714,  0.9650995 , ...,  2.54206589,
         0.51327952,  1.29275159],
       [-1.74412451, -1.89762714,  0.9650995 , ...,  1.98027982,
        -0.83451236, -0.36712256],
       [-1.74314292, -1.89762714,  0.9650995 , ...,  1.73388242,
        -0.34701317, -0.36712256],
       ...,
       [ 1.71206283,  1.76085641, -1.03616259, ..., -0.26686447,
        -0.92054163, -0.36712256],
       [ 1.71353521,  1.76085641, -1.03616259, ..., -0.37527932,
        -0.54774813, -0.36712256],
       [ 1.7150076 ,  1.76085641, -1.03616259, ..., -0.02046707,
        -0.00289609, -0.36712256]])

In [18]:
# Using random forest classifier to predict probability of availability and discontinuity of products
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
clf1 = RandomForestClassifier(n_estimators=500, random_state=0)
clf1.fit(X_train, Y_train)
Y_pred = clf1.predict_proba(X_test)
clf1.score(X_test, Y_test)

0.9379074821484011

In [19]:
# Storing the predictions into a csv file
prediction = pd.DataFrame(Y_pred, columns=['Available','Discontinued'])
prediction.to_csv('Predict1.csv')

In [20]:
# Using SVM classifier for prediction
from sklearn.svm import SVC
clf2 = SVC()
clf2.fit(X_train, Y_train)
Y_pred = clf1.predict(X_test)
clf2.score(X_test, Y_test)

0.9226948152747594

In [21]:
# Using Neural networks for prediction
from sklearn.neural_network import MLPClassifier
clf3 = MLPClassifier(hidden_layer_sizes=(20,15), max_iter=3000)
clf3.fit(X_train, Y_train)
clf3.score(X_test, Y_test)

0.9239366656317913

In [None]:
# We observed that random forest classifier predicted with highest accuracy