Step 1: Data Cleaning and EDA


In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Initialize Spark session
spark = SparkSession.builder.appName("StartupSuccessPrediction").getOrCreate()

# Load the dataset
file_path = 'investments_VC.csv'
data = pd.read_csv(file_path, encoding='latin1')

# Strip whitespace from column names
data.columns = [col.strip() for col in data.columns]

# Replace commas and '-' characters, and convert to numeric
data['funding_total_usd'] = pd.to_numeric(data['funding_total_usd'].str.replace(',', '').replace('-', ''), errors='coerce')

# Display the first few rows of the dataset
print(data.head())

# Basic information about the dataset
print(data.info())

# Check for missing values
print(data.isnull().sum())

# Drop columns with more than 50% missing values
threshold = len(data) * 0.5
data = data.dropna(thresh=threshold, axis=1)

# Fill missing values with appropriate strategies (mean, median, mode, etc.)
data = data.fillna(method='ffill').fillna(method='bfill')

# Display basic statistics
print(data.describe())

# Save for Tableau visualization
data.to_csv('data/cleaned_investments_VC.csv', index=False)

In [6]:
import pandas as pd

# Load the dataset
file_path = 'investments_VC.csv'
data = pd.read_csv(file_path, encoding='latin1')

# Strip whitespace from column names
data.columns = [col.strip() for col in data.columns]

# Replace commas and '-' characters, and convert to numeric
data['funding_total_usd'] = pd.to_numeric(data['funding_total_usd'].str.replace(',', '').replace('-', ''), errors='coerce')

# Calculate total investment
total_investment = data['funding_total_usd'].sum()
print(f"Total Investment: {total_investment}")

# Calculate average investment
average_investment = data['funding_total_usd'].mean()
print(f"Average Investment: {average_investment}")

# Calculate median investment
median_investment = data['funding_total_usd'].median()
print(f"Median Investment: {median_investment}")

# Calculate number of unique sectors
num_sectors = data['market'].nunique()
print(f"Number of Sectors: {num_sectors}")

Total Investment: 650933703144.0
Average Investment: 15912526.05040702
Median Investment: 2000000.0
Number of Sectors: 753


Feature Engineering and Data Transformation using PySpark


In [2]:
# Load the cleaned dataset
df = spark.read.csv('data/cleaned_investments_VC.csv', header=True, inferSchema=True)

# Strip whitespace from column names
df = df.select([col(c).alias(c.strip()) for c in df.columns])

# Display the schema of the dataset
df.printSchema()

# Handling missing values
df = df.na.drop()

# Feature Engineering
# Example: Creating a new column 'is_successful' based on funding rounds
df = df.withColumn("is_successful", when(col("status") == "operating", 1).otherwise(0))

# Selecting relevant features for prediction
selected_columns = ['name', 'funding_total_usd', 'founded_year', 'first_funding_at', 'last_funding_at', 'is_successful']
df = df.select(selected_columns)

# Convert to Pandas DataFrame and save as CSV
df_pd = df.toPandas()
df_pd.to_csv('data/engineered_investments_VC.csv', index=False)

root
 |-- permalink: string (nullable = true)
 |-- name: string (nullable = true)
 |-- homepage_url: string (nullable = true)
 |-- category_list: string (nullable = true)
 |-- market: string (nullable = true)
 |-- funding_total_usd: double (nullable = true)
 |-- status: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- funding_rounds: double (nullable = true)
 |-- founded_at: date (nullable = true)
 |-- founded_month: timestamp (nullable = true)
 |-- founded_quarter: string (nullable = true)
 |-- founded_year: double (nullable = true)
 |-- first_funding_at: date (nullable = true)
 |-- last_funding_at: date (nullable = true)
 |-- seed: double (nullable = true)
 |-- venture: double (nullable = true)
 |-- equity_crowdfunding: double (nullable = true)
 |-- undisclosed: double (nullable = true)
 |-- convertible_note: double (nullable = true)
 |-- debt_f

Machine Learning to Predict Startup Success using PySpark


In [4]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

# Ensure necessary columns are cast to the correct types
df = df.withColumn("funding_total_usd", col("funding_total_usd").cast("double"))
df = df.withColumn("founded_year", col("founded_year").cast("integer"))

# Fill null values
df = df.fillna({'funding_total_usd': 0, 'founded_year': 0})

# Selecting relevant features for prediction
selected_columns = ['name', 'funding_total_usd', 'founded_year', 'is_successful']
df = df.select(selected_columns)

# Count the number of 1s and 0s in the 'is_successful' column
num_zeros = df.filter(col('is_successful') == 0).count()
num_ones = df.filter(col('is_successful') == 1).count()

# Calculate the ratio of 1s to 0s
ratio = num_zeros / num_ones

# Oversample the minority class (1s)
df_ones = df.filter(col('is_successful') == 1)
df_zeros = df.filter(col('is_successful') == 0)

df_ones_oversampled = df_ones.sample(withReplacement=True, fraction=ratio, seed=42)

# Combine the oversampled 1s and the original 0s
df = df_zeros.union(df_ones_oversampled)

# VectorAssembler to combine feature columns into a single feature vector
assembler = VectorAssembler(inputCols=['funding_total_usd', 'founded_year'], outputCol='features')

# Check if 'features' column already exists and drop it if necessary
if 'features' in df.columns:
    df = df.drop('features')

df = assembler.transform(df)

# Split the data into training and test sets
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

# Logistic Regression model
lr = LogisticRegression(labelCol='is_successful', featuresCol='features')
model = lr.fit(train_df)

# Predictions on the test set
predictions = model.transform(test_df)

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol='is_successful')
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy}")

# Rename 'is_successful' to 'actual_success' and 'prediction' to 'predicted_success'
predictions = predictions.withColumnRenamed('is_successful', 'actual_success')
predictions = predictions.withColumnRenamed('prediction', 'predicted_success')

# Print the results
predictions.select("name", "funding_total_usd", "founded_year", "predicted_success", "probability", "actual_success").show()

# Convert predictions to Pandas DataFrame and save for Tableau visualization
predictions_pd = predictions.select("name", "funding_total_usd", "founded_year", "predicted_success", "probability", "actual_success").toPandas()
predictions_pd.to_csv('data/predictions_investments_VC.csv', index=False)

Model Accuracy: 0.5852110216357815
+-------------------+-----------------+------------+-----------------+--------------------+--------------+
|               name|funding_total_usd|founded_year|predicted_success|         probability|actual_success|
+-------------------+-----------------+------------+-----------------+--------------------+--------------+
|       1000 Markets|         500000.0|        2009|              1.0|[0.48845765784664...|             0|
|          12Society|         619494.0|        2012|              1.0|[0.47175117237640...|             0|
|         1C Company|            2.0E8|        1991|              0.0|[0.57005518902110...|             0|
|              1Cast|          40000.0|        2006|              0.0|[0.50522116295291...|             0|
|        24PageBooks|          50000.0|        2010|              1.0|[0.48292893993981...|             0|
|            25eight|          25000.0|        2012|              1.0|[0.47180546621406...|             0|
| 