In [38]:
import pandas as pd

In [39]:
df = pd.read_csv('custom_sales_data.csv')

In [129]:
import numpy as np
import pandas as pd

#plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.metrics
#data preprocessing for the models. Never preprocess before exploration if you can help it
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

#Selecting train and test data.
from sklearn.model_selection import train_test_split

#metrics to check our model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error

#remove warnings every time we plot or apply a model to make notebook prettier
import warnings
warnings.filterwarnings('ignore')

#graphs are displayed in the cell. Magic functions op
%matplotlib inline

In [41]:
class StatisticalModelImplenter():
    
    '''
    Parent class containing common functions 
    '''
    def __init__(self):

        self.all_models = []
        self.all_model_names = []

    def get_models(self):

        '''
        returns a dictionary containing names with the corosponing model
        '''
        model_dict = {self.all_models[i]: self.all_model_names[i] for i in range(len(self.all_models))}
        return model_dict

    def fit_train(self, x_train, y_train):
        
        self.x_train = x_train
        self.y_train = y_train

    def fit_test_set(self, user_x_test, user_y_test):

        self.x_test = user_x_test
        self.y_test = user_y_test

    def apply_metric(self,  metric):

        metric_list = []
        for _, model in enumerate(self.all_models):
            metric_item = metric(self.y_test, model.predict(self.x_test))
            metric_list.append(metric_item)

        self.report_printer(metric_list)

    def report_printer(self, list_of_metric):

        all_model_metrics = dict(zip(self.all_model_names, list_of_metric))

        for name, matrix in all_model_metrics.items():
            print('{}\n{}\n\n'.format(name, matrix))
            
    def best_model(self):
        
        output_count = []
        for _, model in enumerate(self.all_models):
            
            list_of_outputs = list(model.predict(self.__x_test))
            output_count.append(self.count_values(list_of_outputs))
        
        max_predicted_val = self.count_values(output_count)
        print(max_predicted_val)

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# ensemble models for better performance in classification
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

In [326]:
def root_mean_squared_error(y_real, y_predicted):
    return np.sqrt(mean_squared_error(y_real, y_predicted))

class Regression(StatisticalModelImplenter):
    '''
    This class implements multiple regression algorithms to already encoded and scaled data. 
    It required the data to be numeric and divided into train and test sets.
    '''
    def __init__(self):

        self.linear_regressor = LinearRegression()
        self.support_vector_regressor = SVR()
        self.decision_tree_regressor = DecisionTreeRegressor()
        self.random_forest_regressor = RandomForestRegressor()
        self.adaboost_regressor = AdaBoostRegressor()
                
        self.all_models = [self.linear_regressor, self.support_vector_regressor, self.decision_tree_regressor, 
                           self.random_forest_regressor, self.adaboost_regressor]
        self.all_model_names = ['Linear Regression', 'Support Vector Regressor', 'Decision Tree Regressor', 
                           'Random Forest Regressor', 'Adaboost Regressor']
        
        self.metric_names = ['Train Score', 'Test Score', 'Classification Report', 'Confusion Matrix',
                            'F1 Score', 'Accuracy Score']
        self.train_scores = []
        self.test_scores = []        
        self.metric_list = [mean_absolute_error, mean_squared_error, root_mean_squared_error]
        self.metrics = []
        data = {'Model Names': self.all_model_names}
        self.all_model_info = pd.DataFrame(data)
    
    def fit(self, x_train, x_test, y_train, y_test):
        '''
        fits models to data and stores results for metrics
        '''
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        for model in self.all_models:
            model.fit(x_train, y_train)
            
            train_score = model.score(x_train,y_train)
            self.train_scores.append(train_score)
            
            test_score = model.score(x_train,y_train)
            self.test_scores.append(train_score)
            y_predict = model.predict(self.x_test)
            
        self.all_model_info['Train Score'] = self.train_scores
        self.all_model_info['Test Score'] = self.test_scores
        self.apply_metrics()

    def apply_metrics(self):
        self.metrics = []
        for metric in self.metric_list:
            metric_name = str(metric).split(' ')[1]
            print(metric_name)
            for model in self.all_models:
                metric_item = metric(self.y_test, model.predict(self.x_test))
                self.metrics.append(metric_item)
                
            self.all_model_info[metric_name] = self.metrics
            self.metrics = []

    def display_report(self):
        return self.all_model_info

In [327]:
r = Regression()

In [328]:
r.fit(xtrain, xtest, ytrain, ytest)

mean_absolute_error
mean_squared_error
root_mean_squared_error


In [329]:
r.display_report()

Unnamed: 0,Model Names,Train Score,Test Score,mean_absolute_error,mean_squared_error,root_mean_squared_error
0,Linear Regression,0.526584,0.526584,191.903998,62666.156834,250.332093
1,Support Vector Regressor,0.005486,0.005486,159.639918,34674.214065,186.210134
2,Decision Tree Regressor,1.0,1.0,195.714286,72414.285714,269.099026
3,Random Forest Regressor,0.876557,0.876557,165.685714,45897.58,214.237205
4,Adaboost Regressor,0.982384,0.982384,199.234694,72835.732507,269.88096


In [44]:
l1 = ['m1', 'm2', 'm3', 'm4', 'm5']
l2 = ['m1', 'm2', 'm3', 'm4', 'm5']
l3 = [1,2,3,4,5]
l4 = ['m1', 'm2', 'm3', 'm4', 'm5']
l5 = ['v1', 'v2', 'v3', 'v4', 'v5']

In [45]:
s1 = pd.Series(l1)
s2 = pd.Series(l2)
s3 = pd.Series(l3)
s4 = pd.Series(l4)
s5 = pd.Series(l5)

In [46]:
data = {'Model Names': s1, 'Models': s2, 'Value1': s3, 'Value2': s4, 'Value 5': s5}

In [73]:
dfx = pd.DataFrame(data)

In [74]:
dfx.head()

Unnamed: 0,Model Names,Models,Value1,Value2,Value 5
0,m1,m1,1,m1,v1
1,m2,m2,2,m2,v2
2,m3,m3,3,m3,v3
3,m4,m4,4,m4,v4
4,m5,m5,5,m5,v5


In [48]:
df.head()

Unnamed: 0,rating,sales_in_month_1,sales_in_month_2,sales_in_month_3
0,,200,500,300
1,,400,300,250
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350


In [49]:
df.fillna('0', inplace = True)
df.head()

Unnamed: 0,rating,sales_in_month_1,sales_in_month_2,sales_in_month_3
0,0,200,500,300
1,0,400,300,250
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350


In [82]:
df['rating'].unique()

array([0, 3, 4, 6, 2, 8, 7, 1, 5, 9])

In [83]:
le = LabelEncoder()
df['rating'] = le.fit_transform(df['rating'])

In [84]:
x = df.iloc[:, 0:3]
y = df.iloc[:, -1]

In [85]:
x['rating'].unique()

array([0, 3, 4, 6, 2, 8, 7, 1, 5, 9], dtype=int64)

In [86]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.33)
lr = LinearRegression()

In [87]:
lr.fit(xtrain, ytrain)

LinearRegression()

In [88]:
train_scores = []
test_scores = []

In [89]:
print(lr.score(xtrain,ytrain))
print(lr.score(xtest,ytest))
x = lr.score(xtrain,ytrain)
train_scores.append(x)
x = lr.score(xtest,ytest)
test_scores.append(x)

0.5265844496753407
-0.955822729226498


In [90]:
svr = SVR()
svr.fit(xtrain, ytrain)

SVR()

In [91]:
svr.score(xtrain, ytrain)

0.005485811813321861

In [92]:
x = svr.score(xtrain,ytrain)
train_scores.append(x)
x = svr.score(xtest,ytest)
test_scores.append(x)

In [93]:
dtr = DecisionTreeRegressor()
dtr.fit(xtrain, ytrain)
dtr.score(xtrain, ytrain)

1.0

In [94]:
x = dtr.score(xtrain,ytrain)
train_scores.append(x)
x = dtr.score(xtest,ytest)
test_scores.append(x)

In [95]:
rfr = RandomForestRegressor()
rfr.fit(xtrain, ytrain)
rfr.score(xtrain, ytrain)

0.8886010842644693

In [96]:
x = rfr.score(xtrain,ytrain)
train_scores.append(x)
x = rfr.score(xtest,ytest)
test_scores.append(x)

In [97]:
abr = AdaBoostRegressor()
abr.fit(xtrain, ytrain)
abr.score(xtrain, ytrain)

0.9849401371879887

In [98]:
x = abr.score(xtrain,ytrain)
train_scores.append(x)
x = abr.score(xtest,ytest)
test_scores.append(x)

In [99]:
model_list = [lr, svr, rfr, dtr, abr]
model_names = ['lr', 'svr', 'rfr', 'dtr', 'abr']

In [100]:
data2 = {'Model_Names': model_names, 'Models': model_list, 'train_scores': train_scores, 'test_Scores': test_scores}
dfx = pd.DataFrame(data2)
dfx

Unnamed: 0,Model_Names,Models,train_scores,test_Scores
0,lr,LinearRegression(),0.526584,-0.955823
1,svr,SVR(),0.005486,-0.082189
2,rfr,"(DecisionTreeRegressor(max_features='auto', ra...",1.0,-1.942675
3,dtr,DecisionTreeRegressor(),0.888601,-0.565652
4,abr,"(DecisionTreeRegressor(max_depth=3, random_sta...",0.98494,-0.768838


In [101]:
classification_report

<function sklearn.metrics._classification.classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division='warn')>