In [1]:
import pandas as pd

In [2]:
# Read 'clean_data.csv' (path relative to the working directory) into a DataFrame
# and show it as the cell output.
df = pd.read_csv(filepath_or_buffer='clean_data.csv')
df

Unnamed: 0,state_code_clusters,zip_cluster,city_clusters,startup_age_in_years,funding_rounds,age_first_funding_year,age_last_funding_year,funding_age_years,years_since_last_funding,milestones,age_first_milestone_year,age_last_milestone_year,relationships,funding_total_usd,has_VC,has_angel,avg_participants,status,is_top500
0,0,0,0,18,3,2,3,1,15,3,5,7,3,375000,0,1,1.0000,acquired,0
1,1,1,1,25,4,5,10,5,15,1,7,7,9,40100000,1,0,4.7500,acquired,1
2,1,1,1,15,1,1,1,0,14,2,1,2,5,2600000,0,0,4.0000,acquired,1
3,1,1,1,23,3,3,5,2,18,1,6,6,5,40000000,0,0,3.3333,acquired,1
4,1,1,1,2,2,0,2,2,0,1,0,0,2,1300000,1,1,1.0000,closed,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917,1,1,1,16,1,1,1,0,15,2,1,5,9,1100000,0,0,6.0000,acquired,1
918,0,0,0,10,3,7,9,2,1,1,6,6,1,52000000,1,0,2.6667,closed,1
919,1,1,1,13,1,8,8,0,5,1,9,9,5,44000000,0,0,8.0000,closed,1
920,1,1,1,16,2,1,3,2,13,2,1,4,12,15500000,0,0,1.0000,acquired,1


In [3]:
# Build the modelling frame: a copy of df without the columns listed below.
# Every other column of df is carried over unchanged.
df2 = df.drop(
    columns=[
        'state_code_clusters',
        'zip_cluster',
        'city_clusters',
        'age_first_funding_year',
        'funding_total_usd',
        'has_VC',
    ]
)
df2.head()

Unnamed: 0,startup_age_in_years,funding_rounds,age_last_funding_year,funding_age_years,years_since_last_funding,milestones,age_first_milestone_year,age_last_milestone_year,relationships,has_angel,avg_participants,status,is_top500
0,18,3,3,1,15,3,5,7,3,1,1.0,acquired,0
1,25,4,10,5,15,1,7,7,9,0,4.75,acquired,1
2,15,1,1,0,14,2,1,2,5,0,4.0,acquired,1
3,23,3,5,2,18,1,6,6,5,0,3.3333,acquired,1
4,2,2,2,2,0,1,0,0,2,1,1.0,closed,1


In [4]:
# Provisional feature/target split: X is every column except the last one,
# y is the 'is_top500' column.
# NOTE: 'status' still holds raw strings at this point; X and y are assigned
# again further down, after 'status' has been integer-encoded.
X, y = df2.iloc[:, :-1], df2['is_top500']

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Column groups consumed by the preprocessing step.
# Numeric columns: these are the ones handed to the scaler.
numerical_features = [
    'startup_age_in_years',
    'funding_rounds',
    'age_last_funding_year',
    'funding_age_years',
    'years_since_last_funding',
    'milestones',
    'age_first_milestone_year',
    'age_last_milestone_year',
    'relationships',
    'avg_participants',
]
# Categorical columns: these are forwarded without scaling.
categorical_features = [
    'has_angel',
    'status',
]


In [7]:
# Scaler for the numeric columns: centre each column on its mean and scale it
# to unit variance (these are StandardScaler's defaults, spelled out here).
numerical_transformer = StandardScaler(with_mean=True, with_std=True)

In [8]:
# Preprocessing for categorical data
# 'status' holds string labels (e.g. 'acquired', 'closed'). LabelEncoder works on a
# single 1-D column and is not directly usable inside ColumnTransformer, so the
# column is integer-encoded here, before the pipeline is built.
#
# The encoder is fitted on the untouched df['status'] instead of df2['status'].
# This cell overwrites df2['status'] with integer codes, so fitting on df2 would,
# on a second execution of the cell, refit the encoder on those integers and leave
# label_encoder unable to encode string labels. df is never modified, and df2 was
# derived from it by dropping columns only, so both frames have the same rows in
# the same order and the first-run result is unchanged.
label_encoder = LabelEncoder()
df2['status'] = label_encoder.fit_transform(df['status'])

In [9]:
# Column-wise preprocessing: the numeric columns go through the scaler, while the
# categorical columns are forwarded as-is ('status' is already integer-encoded).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', 'passthrough', categorical_features),
])

In [10]:
# Create the pipeline with the preprocessor and the classifier.
# Steps run in order: 'preprocessor' transforms the input columns, then
# 'classifier' is fitted on (or predicts from) the transformed data.
# random_state is pinned because a random forest is stochastic; without a fixed
# seed every run fits, and later saves, a different model.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [11]:
# Rebuild the training inputs now that 'status' has been integer-encoded.
# Target: the 'is_top500' column. Features: every column except the last one.
y = df2['is_top500']
X = df2.iloc[:, :-1]

# Show the feature frame as the cell output.
X

Unnamed: 0,startup_age_in_years,funding_rounds,age_last_funding_year,funding_age_years,years_since_last_funding,milestones,age_first_milestone_year,age_last_milestone_year,relationships,has_angel,avg_participants,status
0,18,3,3,1,15,3,5,7,3,1,1.0000,0
1,25,4,10,5,15,1,7,7,9,0,4.7500,0
2,15,1,1,0,14,2,1,2,5,0,4.0000,0
3,23,3,5,2,18,1,6,6,5,0,3.3333,0
4,2,2,2,2,0,1,0,0,2,1,1.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
917,16,1,1,0,15,2,1,5,9,0,6.0000,0
918,10,3,9,2,1,1,6,6,1,0,2.6667,1
919,13,1,8,0,5,1,9,9,5,0,8.0000,1
920,16,2,3,2,13,2,1,4,12,0,1.0000,0


In [12]:
# Fit the whole pipeline (preprocessing + classifier) on all rows of X / y.
# NOTE(review): no held-out split or evaluation is visible in this notebook.
model.fit(X, y=y)

In [13]:
# Show the row at position 25 of df2 as a {column: value} dict.
# Presumably a reference for the hand-built sample further down; confirm.
dict(df2.iloc[25])

{'startup_age_in_years': 23.0,
 'funding_rounds': 1.0,
 'age_last_funding_year': 5.0,
 'funding_age_years': 0.0,
 'years_since_last_funding': 18.0,
 'milestones': 1.0,
 'age_first_milestone_year': 1.0,
 'age_last_milestone_year': 1.0,
 'relationships': 3.0,
 'has_angel': 0.0,
 'avg_participants': 4.0,
 'status': 0.0,
 'is_top500': 1.0}

In [14]:
# Test on new data: a single hand-built row with the same feature columns the
# model was fitted on.
new_data = pd.DataFrame({
    'startup_age_in_years': [22],
    'funding_rounds': [1],
    'age_last_funding_year': [5],
    'funding_age_years': [0],
    'years_since_last_funding': [17],
    'milestones': [1],
    'age_first_milestone_year': [1],
    'age_last_milestone_year': [1],
    'relationships': [3],
    'has_angel': [0],
    'avg_participants': [4.0],
    # Encode 'status' with the mapping learned from the training data. This must be
    # transform(), not fit_transform(): refitting on this one value would discard
    # the training mapping and encode ANY status (e.g. 'closed') as 0.
    'status': label_encoder.transform(['acquired'])
})

In [15]:
# Display the single-row sample frame so its column names and values can be checked.
new_data

Unnamed: 0,startup_age_in_years,funding_rounds,age_last_funding_year,funding_age_years,years_since_last_funding,milestones,age_first_milestone_year,age_last_milestone_year,relationships,has_angel,avg_participants,status
0,22,1,5,0,17,1,1,1,3,0,4.0,0


In [16]:
# Run the fitted pipeline on the hand-built sample and print the predicted
# 'is_top500' label(s).
prediction = model.predict(new_data)
print(prediction)

[1]


In [17]:
import pickle

# Serialise the fitted pipeline to 'final_.sav' in the working directory.
# NOTE(review): only `model` is written; the label_encoder used to encode 'status'
# is not part of the pipeline, so it is not included in this file.
with open('final_.sav', 'wb') as model_file:
    pickle.dump(model, model_file)