## Building a Machine Learning Pipeline for Startup Acquisition

In [1]:
import pandas as pd
import numpy as np

# Read the companies table from the CSV file into a DataFrame
data = pd.read_csv('companies.csv')

# Preview the first five rows to sanity-check the load
print(data.head(5))

        id  Unnamed: 0.1 entity_type  entity_id  parent_id  \
0      c:1             0     Company          1        NaN   
1     c:10             1     Company         10        NaN   
2    c:100             2     Company        100        NaN   
3  c:10000             3     Company      10000        NaN   
4  c:10001             4     Company      10001        NaN   

                 name     normalized_name                    permalink  \
0            Wetpaint            wetpaint            /company/wetpaint   
1             Flektor             flektor             /company/flektor   
2               There               there               /company/there   
3             MYWEBBO             mywebbo             /company/mywebbo   
4  THE Movie Streamer  the movie streamer  /company/the-movie-streamer   

     category_code     status  ... first_milestone_at last_milestone_at  \
0              web  operating  ...           9/5/2010         9/18/2013   
1      games_video   acquired  .

### Checking the count and percentage of NaN (null) values present in each feature

In [2]:
# Count the missing (NaN) entries in each column of the raw frame
nan_count = data.isna().sum()
print(nan_count)

id                          0
Unnamed: 0.1                0
entity_type                 0
entity_id                   0
parent_id              196553
name                       22
normalized_name            26
permalink                   0
category_code           73367
status                      0
founded_at             105326
closed_at              193933
domain                  70008
homepage_url            70008
twitter_username       115962
logo_url                86443
logo_width              86443
logo_height             86443
short_description      189422
description            104505
overview                69582
tag_list               115101
country_code           108563
state_code             145650
city                   112663
region                      0
first_investment_at    193970
last_investment_at     193970
investment_rounds      193962
invested_companies     193962
first_funding_at       165046
last_funding_at        165046
funding_rounds         164846
funding_to

In [3]:
# Per-column share of missing values, expressed as a percentage of all rows
nan_percentage = data.isna().sum().div(len(data)).mul(100)
print(nan_percentage)

id                       0.000000
Unnamed: 0.1             0.000000
entity_type              0.000000
entity_id                0.000000
parent_id              100.000000
name                     0.011193
normalized_name          0.013228
permalink                0.000000
category_code           37.326828
status                   0.000000
founded_at              53.586564
closed_at               98.667026
domain                  35.617874
homepage_url            35.617874
twitter_username        58.997828
logo_url                43.979486
logo_width              43.979486
logo_height             43.979486
short_description       96.371971
description             53.168865
overview                35.401139
tag_list                58.559778
country_code            55.233448
state_code              74.102151
city                    57.319400
region                   0.000000
first_investment_at     98.685851
last_investment_at      98.685851
investment_rounds       98.681780
invested_compa

### Removing duplicates

In [4]:
# Discard rows that repeat an earlier row in every column (the first occurrence is kept)
data_no_duplicates = data.drop_duplicates(keep='first')
print(data_no_duplicates)

             id  Unnamed: 0.1 entity_type  entity_id  parent_id  \
0           c:1             0     Company          1        NaN   
1          c:10             1     Company         10        NaN   
2         c:100             2     Company        100        NaN   
3       c:10000             3     Company      10000        NaN   
4       c:10001             4     Company      10001        NaN   
...         ...           ...         ...        ...        ...   
196548  c:99940        196548     Company      99940        NaN   
196549   c:9995        196549     Company       9995        NaN   
196550   c:9996        196550     Company       9996        NaN   
196551   c:9997        196551     Company       9997        NaN   
196552   c:9998        196552     Company       9998        NaN   

                      name     normalized_name                    permalink  \
0                 Wetpaint            wetpaint            /company/wetpaint   
1                  Flektor           

### Dropping columns with a high percentage of NaN values

In [5]:
# Step 3: Drop every column whose share of missing values exceeds the threshold.
# The share is measured on the de-duplicated frame that is actually being
# filtered, so the drop decision always matches the data it is applied to.
threshold_percentage = 30  # A threshold between 20% and 30% is often used as a starting point.
nan_percentage_dedup = (data_no_duplicates.isnull().sum() / len(data_no_duplicates)) * 100
columns_with_high_nan = nan_percentage_dedup[nan_percentage_dedup > threshold_percentage].index
data_cleaned = data_no_duplicates.drop(columns=columns_with_high_nan)
print("Columns with high NaN percentages removed:", columns_with_high_nan)

Columns with high NaN percentages removed: Index(['parent_id', 'category_code', 'founded_at', 'closed_at', 'domain',
       'homepage_url', 'twitter_username', 'logo_url', 'logo_width',
       'logo_height', 'short_description', 'description', 'overview',
       'tag_list', 'country_code', 'state_code', 'city', 'first_investment_at',
       'last_investment_at', 'investment_rounds', 'invested_companies',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'first_milestone_at', 'last_milestone_at',
       'milestones', 'relationships', 'lat', 'lng', 'ROI'],
      dtype='object')


### Remove unnecessary and corrupted data.

In [6]:
# Remove unnecessary columns: only the columns listed below are retained
# (for example, 'Unnamed: 0.1' and the logo_* columns are left out).
columns_to_keep = ['id', 'entity_type', 'entity_id', 'parent_id', 'name', 'normalized_name',
    'permalink', 'category_code', 'status', 'founded_at', 'closed_at',
    'domain', 'homepage_url', 'twitter_username', 'short_description',
    'description', 'overview', 'tag_list', 'country_code', 'state_code',
    'city', 'region', 'first_investment_at', 'last_investment_at',
    'investment_rounds', 'invested_companies', 'first_funding_at',
    'last_funding_at', 'funding_rounds', 'funding_total_usd',
    'first_milestone_at', 'last_milestone_at', 'milestones',
    'relationships', 'lat', 'lng', 'ROI']

# Select from the de-duplicated frame rather than the raw one, so the
# duplicate-removal step is not silently bypassed.
# NOTE(review): this rebinds data_cleaned and re-selects columns that the
# high-NaN drop step removed (e.g. 'ROI', 'funding_total_usd'); confirm that
# the earlier column drop is meant to be superseded here.
data_cleaned = data_no_duplicates[columns_to_keep]


# Remove rows based on specific criteria
# Keep only rows whose ROI is present and non-negative. A NaN compared with
# `>= 0` evaluates to False, so rows with a missing ROI are removed as well,
# not just rows with a negative ROI.
data_cleaned = data_cleaned[data_cleaned['ROI'] >= 0]

# Display the cleaned DataFrame
print(data_cleaned)

              id entity_type  entity_id  parent_id           name  \
0            c:1     Company          1        NaN       Wetpaint   
13        c:1001     Company       1001        NaN     FriendFeed   
81        c:1007     Company       1007        NaN        Rupture   
210     c:101519     Company     101519        NaN       Vidacare   
221      c:10158     Company      10158        NaN      StudioNow   
...          ...         ...        ...        ...            ...   
196246    c:9737     Company       9737        NaN     AxisMobile   
196287    c:9775     Company       9775        NaN          Voxeo   
196496    c:9949     Company       9949        NaN           PPTV   
196518   c:99685     Company      99685        NaN  Symbyo Dental   
196519     c:997     Company        997        NaN         Zimbra   

       normalized_name               permalink    category_code     status  \
0             wetpaint       /company/wetpaint              web  operating   
13          fri

In [7]:
# Keep only rows whose funding_total_usd is present and non-negative
# (a missing value fails the comparison, so those rows are dropped too).
data_cleaned = data_cleaned.loc[data_cleaned['funding_total_usd'].ge(0)]
print(data_cleaned)

              id entity_type  entity_id  parent_id           name  \
0            c:1     Company          1        NaN       Wetpaint   
13        c:1001     Company       1001        NaN     FriendFeed   
81        c:1007     Company       1007        NaN        Rupture   
210     c:101519     Company     101519        NaN       Vidacare   
221      c:10158     Company      10158        NaN      StudioNow   
...          ...         ...        ...        ...            ...   
196246    c:9737     Company       9737        NaN     AxisMobile   
196287    c:9775     Company       9775        NaN          Voxeo   
196496    c:9949     Company       9949        NaN           PPTV   
196518   c:99685     Company      99685        NaN  Symbyo Dental   
196519     c:997     Company        997        NaN         Zimbra   

       normalized_name               permalink    category_code     status  \
0             wetpaint       /company/wetpaint              web  operating   
13          fri