Step 1: Data Cleaning and EDA


In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Initialize Spark session
spark = SparkSession.builder.appName("StartupSuccessPrediction").getOrCreate()

# Load the cleaned dataset
df = spark.read.csv('data/cleaned_investments_VC.csv', header=True, inferSchema=True)

# Strip whitespace from column names
df = df.select([col(c).alias(c.strip()) for c in df.columns])

# Display the schema of the dataset
df.printSchema()

# Cast funding_total_usd to a numeric type
df = df.withColumn("funding_total_usd", col("funding_total_usd").cast("double"))

# Ensure other columns are of the correct type
df = df.withColumn("founded_year", col("founded_year").cast("integer"))

# Handle missing values
df = df.na.drop()

# Feature Engineering
df = df.withColumn("is_successful", when(col("status") == "operating", 1).otherwise(0))

# Selecting relevant features for prediction
selected_columns = ['funding_total_usd', 'founded_year', 'first_funding_at', 'last_funding_at', 'is_successful']
df = df.select(selected_columns)

# Convert to Pandas DataFrame and save for Tableau visualization
df_pd = df.toPandas()
df_pd.to_csv('data/engineered_investments_VC.csv', index=False)


root
 |-- permalink: string (nullable = true)
 |-- name: string (nullable = true)
 |-- homepage_url: string (nullable = true)
 |-- category_list: string (nullable = true)
 |-- market: string (nullable = true)
 |-- funding_total_usd: string (nullable = true)
 |-- status: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- funding_rounds: double (nullable = true)
 |-- founded_at: date (nullable = true)
 |-- founded_month: timestamp (nullable = true)
 |-- founded_quarter: string (nullable = true)
 |-- founded_year: double (nullable = true)
 |-- first_funding_at: date (nullable = true)
 |-- last_funding_at: date (nullable = true)
 |-- seed: double (nullable = true)
 |-- venture: double (nullable = true)
 |-- equity_crowdfunding: double (nullable = true)
 |-- undisclosed: double (nullable = true)
 |-- convertible_note: double (nullable = true)
 |-- debt_f

Feature Engineering and Data Transformation using PySpark


In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Initialize Spark session
spark = SparkSession.builder.appName("StartupSuccessPrediction").getOrCreate()

# Load the cleaned dataset
df = spark.read.csv('data/cleaned_investments_VC.csv', header=True, inferSchema=True)

# Strip whitespace from column names
df = df.select([col(c).alias(c.strip()) for c in df.columns])

# Display the schema of the dataset
df.printSchema()

# Cast funding_total_usd to a numeric type
df = df.withColumn("funding_total_usd", col("funding_total_usd").cast("double"))

# Ensure other columns are of the correct type
df = df.withColumn("founded_year", col("founded_year").cast("integer"))

# Handle missing values
df = df.na.drop()

# Feature Engineering
df = df.withColumn("is_successful", when(col("status") == "operating", 1).otherwise(0))

# Selecting relevant features for prediction
selected_columns = ['name', 'funding_total_usd', 'founded_year', 'first_funding_at', 'last_funding_at', 'is_successful']
df = df.select(selected_columns)

# Convert to Pandas DataFrame and save for Tableau visualization
df_pd = df.toPandas()
df_pd.to_csv('data/engineered_investments_VC.csv', index=False)


root
 |-- permalink: string (nullable = true)
 |-- name: string (nullable = true)
 |-- homepage_url: string (nullable = true)
 |-- category_list: string (nullable = true)
 |-- market: string (nullable = true)
 |-- funding_total_usd: string (nullable = true)
 |-- status: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- funding_rounds: double (nullable = true)
 |-- founded_at: date (nullable = true)
 |-- founded_month: timestamp (nullable = true)
 |-- founded_quarter: string (nullable = true)
 |-- founded_year: double (nullable = true)
 |-- first_funding_at: date (nullable = true)
 |-- last_funding_at: date (nullable = true)
 |-- seed: double (nullable = true)
 |-- venture: double (nullable = true)
 |-- equity_crowdfunding: double (nullable = true)
 |-- undisclosed: double (nullable = true)
 |-- convertible_note: double (nullable = true)
 |-- debt_f

Machine Learning to Predict Startup Success using PySpark


In [29]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

# Ensure necessary columns are cast to the correct types
df = df.withColumn("funding_total_usd", col("funding_total_usd").cast("double"))
df = df.withColumn("founded_year", col("founded_year").cast("integer"))

# Selecting relevant features for prediction
# Replace 'name' with the correct column name
selected_columns = ['name', 'funding_total_usd', 'founded_year', 'is_successful']
df = df.select(selected_columns)

# VectorAssembler to combine feature columns into a single feature vector
assembler = VectorAssembler(inputCols=['funding_total_usd', 'founded_year'], outputCol='features')

# Check if 'features' column already exists and drop it if necessary
if 'features' in df.columns:
    df = df.drop('features')

df = assembler.transform(df)

# Split the data into training and test sets
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

# Logistic Regression model
lr = LogisticRegression(labelCol='is_successful', featuresCol='features')
model = lr.fit(train_df)

# Predictions on the test set
predictions = model.transform(test_df)

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol='is_successful')
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy}")

# Convert predictions to Pandas DataFrame and save for Tableau visualizatione
# Convert predictions and other columns to Pandas DataFrame and save for Tableau visualization
predictions_pd = predictions.select("name", "funding_total_usd", "founded_year", "prediction", "probability", "is_successful").toPandas()
predictions_pd.to_csv('data/predictions_investments_VC.csv', index=False)

Model Accuracy: 1.0
