# 1. Importing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# 2. Loading the Data

In [2]:
df = pd.read_csv("companies.csv")

# 3. Normalize Column Names 

In [3]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Display basic info
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196553 entries, 0 to 196552
Data columns (total 44 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   196553 non-null  object 
 1   unnamed:_0.1         196553 non-null  int64  
 2   entity_type          196553 non-null  object 
 3   entity_id            196553 non-null  int64  
 4   parent_id            0 non-null       float64
 5   name                 196531 non-null  object 
 6   normalized_name      196527 non-null  object 
 7   permalink            196553 non-null  object 
 8   category_code        123186 non-null  object 
 9   status               196553 non-null  object 
 10  founded_at           91227 non-null   object 
 11  closed_at            2620 non-null    object 
 12  domain               126545 non-null  object 
 13  homepage_url         126545 non-null  object 
 14  twitter_username     80591 non-null   object 
 15  logo_url         

Unnamed: 0,id,unnamed:_0.1,entity_type,entity_id,parent_id,name,normalized_name,permalink,category_code,status,...,first_milestone_at,last_milestone_at,milestones,relationships,created_by,created_at,updated_at,lat,lng,roi
0,c:1,0,Company,1,,Wetpaint,wetpaint,/company/wetpaint,web,operating,...,2010-09-05,2013-09-18,5.0,17.0,initial-importer,2007-05-25 06:51:27,2013-04-13 03:29:00,47.606209,-122.332071,15.5
1,c:10,1,Company,10,,Flektor,flektor,/company/flektor,games_video,acquired,...,,,,6.0,initial-importer,2007-05-31 21:11:51,2008-05-23 23:23:14,34.021122,-118.396467,
2,c:100,2,Company,100,,There,there,/company/there,games_video,acquired,...,2003-02-01,2011-09-23,4.0,12.0,initial-importer,2007-08-06 23:52:45,2013-11-04 02:09:48,37.562992,-122.325525,
3,c:10000,3,Company,10000,,MYWEBBO,mywebbo,/company/mywebbo,network_hosting,operating,...,,,,,,2008-08-24 16:51:57,2008-09-06 14:19:18,,,
4,c:10001,4,Company,10001,,THE Movie Streamer,the movie streamer,/company/the-movie-streamer,games_video,operating,...,,,,,,2008-08-24 17:10:34,2008-09-06 14:19:18,,,


# 4. Drop Irrelevant Columns

In [4]:
columns_to_drop = [
    'Unnamed: 0.1', 'created_at', 'created_by', 'updated_at',
    'logo_url', 'logo_width', 'logo_height',
    'overview', 'permalink'
]
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# 5. Remove Rows Missing Essential Identifiers

In [5]:
df = df.dropna(subset=['name', 'status'])

# 6. Fill Non-Critical Missing Values with Placeholders

In [6]:
fill_na_with_placeholder = {
    'description': 'Not Provided',
    'short_description': 'Not Provided',
    'country_code': 'Unknown',
    'state_code': 'Unknown',
    'city': 'Unknown',
    'region': 'Unknown'
}
df.fillna(value=fill_na_with_placeholder, inplace=True)

# 7. Parse Date and Numeric Columns

In [7]:
date_columns = ['founded_at', 'first_funding_at', 'last_milestone_at']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

numeric_columns = ['funding_total_usd', 'investment_rounds', 'roi']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 8. Ensure Categorical Columns Are Strings

In [8]:
categorical_columns = ['status', 'category_code', 'city']
for col in categorical_columns:
    df[col] = df[col].astype(str)

# 9. Clean Up Geographical Coordinates

In [9]:
df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
df['lng'] = pd.to_numeric(df['lng'], errors='coerce')

# 10. Feature Engineering

In [10]:
# Calculate company age
if 'founded_at' in df.columns:
    df['founded_year'] = df['founded_at'].dt.year
    df['company_age'] = 2025 - df['founded_year']

# 11. Encode Categorical Features

In [11]:
label_encodable = ['country_code', 'state_code', 'city', 'region', 'category_code']
for col in label_encodable:
    if col in df.columns:
        df[col] = LabelEncoder().fit_transform(df[col])

# 12. Added Active_Status column.

In [12]:
def label_status(status):
    if status in ['operating', 'ipo']:
        return 1  # Active
    elif status in ['acquired', 'closed']:
        return 0  # Not Active
    else:
        return np.nan  # unknown

df['active_status'] = df['status'].apply(label_status)

# Drop rows with unknown status labels
df.dropna(subset=['active_status'], inplace=True)
df['active_status'] = df['active_status'].astype(int)


In [13]:
df.columns

Index(['id', 'unnamed:_0.1', 'entity_type', 'entity_id', 'parent_id', 'name',
       'normalized_name', 'category_code', 'status', 'founded_at', 'closed_at',
       'domain', 'homepage_url', 'twitter_username', 'short_description',
       'description', 'tag_list', 'country_code', 'state_code', 'city',
       'region', 'first_investment_at', 'last_investment_at',
       'investment_rounds', 'invested_companies', 'first_funding_at',
       'last_funding_at', 'funding_rounds', 'funding_total_usd',
       'first_milestone_at', 'last_milestone_at', 'milestones',
       'relationships', 'lat', 'lng', 'roi', 'founded_year', 'company_age',
       'active_status'],
      dtype='object')

# 13. Save the Cleaned Data into CSV

In [58]:
df.to_csv("Data Preprocessing_companies.csv", index=False)

In [14]:
df

Unnamed: 0,id,unnamed:_0.1,entity_type,entity_id,parent_id,name,normalized_name,category_code,status,founded_at,...,first_milestone_at,last_milestone_at,milestones,relationships,lat,lng,roi,founded_year,company_age,active_status
0,c:1,0,Company,1,,Wetpaint,wetpaint,42,operating,2005-10-17,...,2010-09-05,2013-09-18,5.0,17.0,47.606209,-122.332071,15.5,2005.0,20.0,1
1,c:10,1,Company,10,,Flektor,flektor,12,acquired,NaT,...,,NaT,,6.0,34.021122,-118.396467,,,,0
2,c:100,2,Company,100,,There,there,12,acquired,NaT,...,2003-02-01,2011-09-23,4.0,12.0,37.562992,-122.325525,,,,0
3,c:10000,3,Company,10000,,MYWEBBO,mywebbo,26,operating,2008-07-26,...,,NaT,,,,,,2008.0,17.0,1
4,c:10001,4,Company,10001,,THE Movie Streamer,the movie streamer,12,operating,2008-07-26,...,,NaT,,,,,,2008.0,17.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,c:99940,196548,Company,99940,,Webgility,webgility,7,operating,2007-01-01,...,2013-08-12,2013-08-12,2.0,5.0,37.774929,-122.419415,,2007.0,18.0,1
196549,c:9995,196549,Company,9995,,FohBoh,fohboh,32,operating,2007-11-01,...,2013-05-01,2013-05-01,1.0,14.0,37.338208,-121.886329,,2007.0,18.0,1
196550,c:9996,196550,Company,9996,,CSC,csc,5,operating,1959-01-01,...,2012-01-01,2013-10-30,3.0,44.0,38.882334,-77.171091,,1959.0,66.0,1
196551,c:9997,196551,Company,9997,,Top-candidate,top candidate,34,operating,2008-07-01,...,,NaT,,1.0,34.052234,-118.243685,,2008.0,17.0,1
