In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt 
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] =(20,10)

In [2]:
#Reading data from the csv file
df1 = pd.read_csv("C:/Users/runya/Documents/aivestdataset_new.csv")
df1.head()                  

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,/organization/-fame,#fame,http://livfame.com,Media,10000000,operating,IND,16,Mumbai,Mumbai,1,,1/5/2015,1/5/2015
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000,operating,USA,DE,DE - Other,Delaware City,2,9/4/2014,3/1/2014,10/14/2014
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3406878,operating,,,,,1,,1/30/2014,1/30/2014
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,2000000,operating,CHN,22,Beijing,Beijing,1,1/1/2007,3/19/2008,3/19/2008
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,-,operating,USA,IL,"Springfield, Illinois",Champaign,1,1/1/2010,7/24/2014,7/24/2014


In [3]:
df1.shape

(4999, 14)

In [4]:
df1.groupby('category_list')['category_list'].agg('count')

category_list
3D                                                                                               1
3D Printing                                                                                      2
3D Printing|3D Technology|Printing                                                               1
3D Printing|Architecture|Bitcoin|Construction|Crowdfunding|Cryptocurrency|FinTech|Real Estate    1
3D Printing|Collaborative Consumption|Local|Marketplaces                                         1
                                                                                                ..
iPad|Mobile|Point of Sale|Retail|SaaS|Software|Web Development|Web Tools                         1
iPad|iPhone|Mobile|Music                                                                         1
iPhone|Lifestyle|Online Shopping|Retail                                                          1
iPhone|Location Based Services|Mobile|Software|Startups                                        

In [5]:
#Droping unwanted cells or columns
df2 = df1.drop(['name','permalink','homepage_url','country_code','state_code','region','funding_rounds','first_funding_at','last_funding_at','city'], axis='columns')
df2.head()

Unnamed: 0,category_list,funding_total_usd,status,founded_at
0,Media,10000000,operating,
1,Application Platforms|Real Time|Social Network...,700000,operating,9/4/2014
2,Apps|Games|Mobile,3406878,operating,
3,Curated Web,2000000,operating,1/1/2007
4,Software,-,operating,1/1/2010


In [6]:
#Data Cleaning
df2.isnull().sum()

category_list         225
funding_total_usd       0
status                  0
founded_at           1213
dtype: int64

In [7]:
# droping null or empty rows
df3 = df2.dropna()
df3.isnull().sum()

category_list        0
funding_total_usd    0
status               0
founded_at           0
dtype: int64

In [8]:
#checking total number of rows and columns
df3.shape

(3683, 4)

In [9]:
df3['category_list'].unique()

array(['Application Platforms|Real Time|Social Network Media',
       'Curated Web', 'Software', ...,
       'Big Data|Enterprise Software|Health Diagnostics|Internet of Things',
       'Home Automation|Internet of Things|Security|Technology',
       'Digital Media|Social Media'], dtype=object)

In [10]:
# Create new column for success, 1 = “operating or “acquired” and 0 = “closed”
df3['success'] = df3['status'].apply(lambda x: 0 if 'closed' in x.lower() else 1)
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['success'] = df3['status'].apply(lambda x: 0 if 'closed' in x.lower() else 1)


Unnamed: 0,category_list,funding_total_usd,status,founded_at,success
1,Application Platforms|Real Time|Social Network...,700000,operating,9/4/2014,1
3,Curated Web,2000000,operating,1/1/2007,1
4,Software,-,operating,1/1/2010,1
6,Biotechnology,762851,operating,1/1/1997,1
7,Analytics,33600000,operating,1/1/2011,1


In [11]:
df5 = df3.copy()
df5.category_list.unique()

array(['Application Platforms|Real Time|Social Network Media',
       'Curated Web', 'Software', ...,
       'Big Data|Enterprise Software|Health Diagnostics|Internet of Things',
       'Home Automation|Internet of Things|Security|Technology',
       'Digital Media|Social Media'], dtype=object)

In [12]:
df5.category_list = df5.category_list.apply(lambda x: x.strip())
category_stats = df5.groupby('category_list')['category_list'].agg('count').sort_values(ascending=False)
category_stats.head()

category_list
Biotechnology    296
Software         249
Mobile            88
Advertising       76
Health Care       65
Name: category_list, dtype: int64

In [13]:
category_stats_less_200 = category_stats[category_stats<=30]
category_stats_less_200.head()

category_list
Real Estate    29
Games          28
Security       28
Consulting     27
Education      26
Name: category_list, dtype: int64

In [14]:
#Changing other categories to other
df5.category_list = df5.category_list.apply(lambda x: 'other' if x in category_stats_less_200 else x)
len(df5.category_list.unique())

14

In [15]:
df5.head(10)

Unnamed: 0,category_list,funding_total_usd,status,founded_at,success
1,other,700000,operating,9/4/2014,1
3,Curated Web,2000000,operating,1/1/2007,1
4,Software,-,operating,1/1/2010,1
6,Biotechnology,762851,operating,1/1/1997,1
7,other,33600000,operating,1/1/2011,1
8,Mobile,1150050,operating,8/1/2011,1
9,E-Commerce,40000,operating,1/1/2012,1
11,other,-,operating,12/7/2013,1
12,Health and Wellness,-,operating,1/1/1986,1
13,Health and Wellness,1750000,operating,1/1/1984,1


In [16]:
df5.funding_total_usd.head()

1      700000
3     2000000
4           -
6      762851
7    33600000
Name: funding_total_usd, dtype: object

In [17]:
df5.funding_total_usd.unique()

array(['700000', '2000000', '-', ..., '48000000', '6421353', '96700000'],
      dtype=object)

In [18]:
df5['funding_total_usd'].replace(to_replace='-',value=9053238,inplace=True)

In [19]:
df5.funding_total_usd.head()

1      700000
3     2000000
4     9053238
6      762851
7    33600000
Name: funding_total_usd, dtype: object

In [20]:
#Cleaning date and droping columns
df10 = df5.drop('status', axis='columns')
df10['founded_at'] = pd.to_datetime(df10['founded_at'])
df10['founded_year'] = df10['founded_at'].dt.year 
df10.head()

Unnamed: 0,category_list,funding_total_usd,founded_at,success,founded_year
1,other,700000,2014-09-04,1,2014
3,Curated Web,2000000,2007-01-01,1,2007
4,Software,9053238,2010-01-01,1,2010
6,Biotechnology,762851,1997-01-01,1,1997
7,other,33600000,2011-01-01,1,2011


In [21]:
#Getting dummy data on the model to convert string data into values.
dummies = pd.get_dummies(df10[['category_list']])
dummies.head()

Unnamed: 0,category_list_Advertising,category_list_Biotechnology,category_list_Clean Technology,category_list_Curated Web,category_list_E-Commerce,category_list_Enterprise Software,category_list_Hardware + Software,category_list_Health Care,category_list_Health and Wellness,category_list_Manufacturing,category_list_Mobile,category_list_Semiconductors,category_list_Software,category_list_other
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0
6,0,1,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [22]:
df11 = pd.concat([df10, dummies], axis='columns')
df11.head(4)

Unnamed: 0,category_list,funding_total_usd,founded_at,success,founded_year,category_list_Advertising,category_list_Biotechnology,category_list_Clean Technology,category_list_Curated Web,category_list_E-Commerce,category_list_Enterprise Software,category_list_Hardware + Software,category_list_Health Care,category_list_Health and Wellness,category_list_Manufacturing,category_list_Mobile,category_list_Semiconductors,category_list_Software,category_list_other
1,other,700000,2014-09-04,1,2014,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,Curated Web,2000000,2007-01-01,1,2007,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,Software,9053238,2010-01-01,1,2010,0,0,0,0,0,0,0,0,0,0,0,0,1,0
6,Biotechnology,762851,1997-01-01,1,1997,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
#drop category_list from the datafile 11
df12 = df11.drop(['category_list','founded_at'], axis=1)
df12.head(4)

Unnamed: 0,funding_total_usd,success,founded_year,category_list_Advertising,category_list_Biotechnology,category_list_Clean Technology,category_list_Curated Web,category_list_E-Commerce,category_list_Enterprise Software,category_list_Hardware + Software,category_list_Health Care,category_list_Health and Wellness,category_list_Manufacturing,category_list_Mobile,category_list_Semiconductors,category_list_Software,category_list_other
1,700000,1,2014,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2000000,1,2007,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,9053238,1,2010,0,0,0,0,0,0,0,0,0,0,0,0,1,0
6,762851,1,1997,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
#Creating x variable and drop success column since it is our dependent variable for our model training
X = df12.drop('success', axis='columns')
X.head()

Unnamed: 0,funding_total_usd,founded_year,category_list_Advertising,category_list_Biotechnology,category_list_Clean Technology,category_list_Curated Web,category_list_E-Commerce,category_list_Enterprise Software,category_list_Hardware + Software,category_list_Health Care,category_list_Health and Wellness,category_list_Manufacturing,category_list_Mobile,category_list_Semiconductors,category_list_Software,category_list_other
1,700000,2014,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2000000,2007,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,9053238,2010,0,0,0,0,0,0,0,0,0,0,0,0,1,0
6,762851,1997,0,1,0,0,0,0,0,0,0,0,0,0,0,0
7,33600000,2011,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [25]:
#creating the value of y
y = df12.success
y.head()

1    1
3    1
4    1
6    1
7    1
Name: success, dtype: int64

In [26]:
#Creating our model and creating the variables x and y
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
# lr_clf.score(X_test,y_test)
accuracy_score(y_test, np.around(lr_clf.predict(X_test),0))

0.9253731343283582

In [28]:
#Randomise or shuffle the data
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=1)
cross_val_score(LinearRegression(), X,y, cv=cv)

array([-0.0013101 , -0.01311883, -0.03062019, -0.00385689,  0.00654174])

In [29]:
#Prediction function testing
# X.columns
# np.where(X.columns=='category_list_Curated Web')[0][0] 

In [32]:
#Prediction Model function Creating
def predict(category_list,funding_total_usd,founded_year):
    loc_index = np.where(X.columns==category_list)[0][0]
    
    x = np.zeros(len(X.columns))
    x[0] = funding_total_usd
    x[1] = founded_year
    if(loc_index >= 0):
        x[loc_index] = 1
    return lr_clf.predict([x])[0]  

In [33]:
#Testing Prediction Model
predict('category_list_Software',500000,2022)

0.9380178222353305

In [34]:
#Exporting to pickle file
import pickle
with open('aivest_startupSuccess_prediction_system.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [35]:
#Exporting to json file
import json
columns = {
    'data_columns': [col.lower() for col in X.columns]
}
with open("columns.json", "w") as f:
    f.write(json.dumps(columns))