In [1]:
# python_cleaning_analysis.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load Data
df = pd.read_csv('startup_funding_analysis.csv')

# 2. Initial Data Exploration
print("DataFrame Info:")
print(df.info())
print("\nMissing Values per Column:")
print(df.isnull().sum())
print("\nDescriptive Statistics:")
print(df.describe(include='all'))

# 3. Data Cleaning

# Standardize text fields (example: city names)
df['city'] = df['city'].str.strip().str.title()

# Handle missing values
# Example: Fill missing employee_growth with median, drop rows with missing company_name
if df['employee_growth'].isnull().sum() > 0:
    median_growth = df['employee_growth'].median()
    df['employee_growth'].fillna(median_growth, inplace=True)
df.dropna(subset=['company_name'], inplace=True)

# Convert founded column to datetime and calculate company age
if 'founded' in df.columns:
    df['founded'] = pd.to_datetime(df['founded'], errors='coerce')
    df['company_age'] = pd.Timestamp('now').year - df['founded'].dt.year

# Remove duplicates
before = len(df)
df.drop_duplicates(inplace=True)
after = len(df)
print(f"Removed {before - after} duplicate rows.")

# 4. Statistical Insights

# Summary statistics for key variables
print("\nEmployee Growth Stats:")
print(df['employee_growth'].describe())
print("\nFunding Stats:")
if 'total_funding' in df.columns:
    print(df['total_funding'].describe())

# Correlation analysis
if 'total_funding' in df.columns:
    print("\nCorrelation between employee growth and funding:")
    print(df[['employee_growth', 'total_funding']].corr())

# 5. Visualization (for Step 2.11, but included here for completeness)

# Histogram of employee growth
plt.figure(figsize=(8, 4))
sns.histplot(df['employee_growth'], bins=30, kde=True)
plt.title('Distribution of Employee Growth')
plt.xlabel('Employee Growth (%)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig('employee_growth_distribution.png')
plt.close()

# Scatter plot: Employee Growth vs. Total Funding
if 'total_funding' in df.columns:
    plt.figure(figsize=(8, 4))
    sns.scatterplot(data=df, x='total_funding', y='employee_growth')
    plt.title('Employee Growth vs. Total Funding')
    plt.xlabel('Total Funding ($)')
    plt.ylabel('Employee Growth (%)')
    plt.tight_layout()
    plt.savefig('growth_vs_funding.png')
    plt.close()

# 6. Save Cleaned Data
df.to_csv('startup_funding_analysis_cleaned.csv', index=False)

print("Data cleaning and analysis complete. Cleaned data saved to 'startup_funding_analysis_cleaned.csv'.")


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company_name       846 non-null    object 
 1   city               838 non-null    object 
 2   state              846 non-null    object 
 3   current_employees  842 non-null    float64
 4   last_employees     836 non-null    float64
 5   employee_growth    836 non-null    float64
 6   founded            676 non-null    float64
 7   total_funding      843 non-null    float64
 8   linkedin_url       845 non-null    object 
 9   url                846 non-null    object 
dtypes: float64(5), object(5)
memory usage: 66.2+ KB
None

Missing Values per Column:
company_name           0
city                   8
state                  0
current_employees      4
last_employees        10
employee_growth       10
founded              170
total_funding          3
linkedin_url     

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['employee_growth'].fillna(median_growth, inplace=True)
