In [1]:
#imports

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

In [2]:
#load dataset

dataset = pd.read_csv('Shark_Tank_India_all_s1_s2_combined.csv')
dataset = dataset.rename(columns=lambda x: x.strip().lower())
dataset = dataset.set_index('id')

dataset = dataset[[
    'industry',
    'yearly revenue',
    'monthly sales',
    'gross margin',
    'net margin',
    'original ask amount',
    'original offered equity',
    'valuation requested',
    'has patents',
    'number of sharks in deal',
    'ashneer investment amount',
    'aman investment amount',
    'anupam investment amount',
    'namita investment amount',
    'vineeta investment amount',
    'ghazal investment amount',
    'peyush investment amount',
    'amit investment amount',
]]
dataset.head()

Unnamed: 0_level_0,industry,yearly revenue,monthly sales,gross margin,net margin,original ask amount,original offered equity,valuation requested,has patents,number of sharks in deal,ashneer investment amount,aman investment amount,anupam investment amount,namita investment amount,vineeta investment amount,ghazal investment amount,peyush investment amount,amit investment amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,Food,95.0,8.0,,,50.0,5.0,1000.0,,3.0,25.0,25.0,,,25.0,,,
2,Vehicles/Electrical Vehicles,4.0,0.4,,,40.0,15.0,267.0,,2.0,20.0,,,,20.0,,,
3,Beauty/Fashion,,2.0,,,25.0,10.0,250.0,,2.0,,,12.5,,12.5,,,
4,Food,700.0,,48.0,,70.0,1.0,7000.0,,1.0,70.0,,,,,,,
5,Education,30.0,,,,50.0,5.0,1000.0,,,,,,,,,,


In [3]:
#cleaning data

dataset['has patents'] = dataset['has patents'].fillna(False)
dataset['number of sharks in deal'] = dataset['number of sharks in deal'].fillna(0)
dataset['yearly revenue'] = dataset['number of sharks in deal'].fillna(0).map(lambda x: 0 if x <= 0 else x)
dataset['monthly sales'] = dataset['number of sharks in deal'].fillna(0).map(lambda x: 0 if x <= 0 else x)
dataset['gross margin'] = dataset['number of sharks in deal'].fillna(0).map(lambda x: 0 if x <= 0 else x)
dataset['net margin'] = dataset['number of sharks in deal'].fillna(0).map(lambda x: 0 if x <= 0 else x)
dataset['ashneer investment amount'] = dataset['ashneer investment amount'].fillna(0).map(lambda x: x > 0)
dataset['aman investment amount'] = dataset['aman investment amount'].fillna(0).map(lambda x: x > 0)
dataset['anupam investment amount'] = dataset['anupam investment amount'].fillna(0).map(lambda x: x > 0)
dataset['namita investment amount'] = dataset['namita investment amount'].fillna(0).map(lambda x: x > 0)
dataset['vineeta investment amount'] = dataset['vineeta investment amount'].fillna(0).map(lambda x: x > 0)
dataset['peyush investment amount'] = dataset['peyush investment amount'].fillna(0).map(lambda x: x > 0)
dataset['ghazal investment amount'] = dataset['ghazal investment amount'].fillna(0).map(lambda x: x > 0)
dataset['amit investment amount'] = dataset['amit investment amount'].fillna(0).map(lambda x: x > 0)

dataset.head()

Unnamed: 0_level_0,industry,yearly revenue,monthly sales,gross margin,net margin,original ask amount,original offered equity,valuation requested,has patents,number of sharks in deal,ashneer investment amount,aman investment amount,anupam investment amount,namita investment amount,vineeta investment amount,ghazal investment amount,peyush investment amount,amit investment amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,Food,3.0,3.0,3.0,3.0,50.0,5.0,1000.0,False,3.0,True,True,False,False,True,False,False,False
2,Vehicles/Electrical Vehicles,2.0,2.0,2.0,2.0,40.0,15.0,267.0,False,2.0,True,False,False,False,True,False,False,False
3,Beauty/Fashion,2.0,2.0,2.0,2.0,25.0,10.0,250.0,False,2.0,False,False,True,False,True,False,False,False
4,Food,1.0,1.0,1.0,1.0,70.0,1.0,7000.0,False,1.0,True,False,False,False,False,False,False,False
5,Education,0.0,0.0,0.0,0.0,50.0,5.0,1000.0,False,0.0,False,False,False,False,False,False,False,False


In [56]:
#preprocessing

X = dataset[[
    'industry',
    'yearly revenue',
    'monthly sales',
    'gross margin',
    'net margin',
    'original ask amount',
    'original offered equity',
    'valuation requested',
    'has patents'
]]
industry_dummies = pd.get_dummies(dataset['industry'], prefix='industry')
X = pd.concat([X, industry_dummies], axis=1)
X = X.drop(['industry'], axis=1)
X_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)

y = dataset[[
    'number of sharks in deal',
    'ashneer investment amount',
    'aman investment amount',
    'anupam investment amount',
    'namita investment amount',
    'vineeta investment amount',
    'ghazal investment amount',
    'peyush investment amount',
    'amit investment amount',
]]

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

In [48]:
pd.DataFrame(y_test)

Unnamed: 0_level_0,number of sharks in deal,ashneer investment amount,aman investment amount,anupam investment amount,namita investment amount,vineeta investment amount,ghazal investment amount,peyush investment amount,amit investment amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
73,1.0,False,True,False,False,False,False,False,False
46,3.0,False,True,True,True,False,False,False,False
318,2.0,False,True,False,False,False,False,False,False
195,2.0,False,True,False,False,False,False,True,False
120,0.0,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
229,1.0,False,True,False,False,False,False,False,False
267,1.0,False,False,False,False,False,False,False,True
102,1.0,False,True,False,False,False,False,False,False
66,3.0,True,True,False,False,True,False,False,False


In [57]:
model = MultiOutputRegressor(LogisticRegression())
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.23213126983823018

In [59]:
pickle.dump(model, open('shark_tank_india_model.sav', 'wb'))