Step 1: Data Cleaning and EDA


In [53]:
import pandas as pd

# Load the dataset
file_path = 'investments_VC.csv'
data = pd.read_csv(file_path, encoding='latin1')

# Strip whitespace from column names
data.columns = [col.strip() for col in data.columns]

# Replace commas and '-' characters, and convert to numeric
data['funding_total_usd'] = pd.to_numeric(data['funding_total_usd'].str.replace(',', '').replace('-', ''), errors='coerce')

# Display the first few rows of the dataset
print(data.head())

# Basic information about the dataset
print(data.info())

# Check for missing values
print(data.isnull().sum())

# Drop columns with more than 50% missing values
threshold = len(data) * 0.5
data = data.dropna(thresh=threshold, axis=1)

# Fill missing values with appropriate strategies (mean, median, mode, etc.)
data = data.fillna(method='ffill').fillna(method='bfill')

# Display basic statistics
print(data.describe())

# Save for Tableau visualization
data.to_csv('data/cleaned_investments_VC.csv', index=False)

                         permalink                name  \
0            /organization/waywire            #waywire   
1  /organization/tv-communications  &TV Communications   
2    /organization/rock-your-paper   'Rock' Your Paper   
3   /organization/in-touch-network   (In)Touch Network   
4   /organization/r-ranch-and-mine  -R- Ranch and Mine   

                    homepage_url  \
0         http://www.waywire.com   
1          http://enjoyandtv.com   
2   http://www.rockyourpaper.org   
3  http://www.InTouchNetwork.com   
4                            NaN   

                                       category_list         market  \
0         |Entertainment|Politics|Social Media|News|          News    
1                                            |Games|         Games    
2                             |Publishing|Education|    Publishing    
3  |Electronics|Guides|Coffee|Restaurants|Music|i...   Electronics    
4                      |Tourism|Entertainment|Games|       Tourism    

   fund

  data = data.fillna(method='ffill').fillna(method='bfill')


       funding_total_usd  funding_rounds  founded_year          seed  \
count       5.429400e+04    54294.000000  54294.000000  5.429400e+04   
mean        1.817699e+07        1.902254   2006.533245  1.978845e+05   
std         1.485455e+08        1.399079      7.725971  1.010514e+06   
min         1.000000e+00        1.000000   1902.000000  0.000000e+00   
25%         4.318845e+05        1.000000   2003.000000  0.000000e+00   
50%         2.600000e+06        1.000000   2009.000000  0.000000e+00   
75%         1.580695e+07        2.000000   2011.000000  1.000000e+04   
max         3.007950e+10       18.000000   2014.000000  1.300000e+08   

            venture  equity_crowdfunding   undisclosed  convertible_note  \
count  5.429400e+04         5.429400e+04  5.429400e+04      5.429400e+04   
mean   9.334456e+06         5.612081e+03  1.185744e+05      2.127444e+04   
std    2.779075e+07         1.907637e+05  2.845195e+06      1.366521e+06   
min    0.000000e+00         0.000000e+00  0.000

Feature Engineering and Data Transformation using PySpark


In [54]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Initialize Spark session
spark = SparkSession.builder.appName("StartupSuccessPrediction").getOrCreate()

# Load the cleaned dataset
df = spark.read.csv('data/cleaned_investments_VC.csv', header=True, inferSchema=True)

# Strip whitespace from column names
df = df.select([col(c).alias(c.strip()) for c in df.columns])

# Display the schema of the dataset
df.printSchema()

# Handling missing values
df = df.na.drop()

# Feature Engineering
# Example: Creating a new column 'is_successful' based on funding rounds
df = df.withColumn("is_successful", when(col("status") == "operating", 1).otherwise(0))

# Selecting relevant features for prediction
selected_columns = ['name', 'funding_total_usd', 'founded_year', 'first_funding_at', 'last_funding_at', 'is_successful']
df = df.select(selected_columns)

# Convert to Pandas DataFrame and save as CSV
df_pd = df.toPandas()
df_pd.to_csv('data/engineered_investments_VC.csv', index=False)


root
 |-- permalink: string (nullable = true)
 |-- name: string (nullable = true)
 |-- homepage_url: string (nullable = true)
 |-- category_list: string (nullable = true)
 |-- market: string (nullable = true)
 |-- funding_total_usd: double (nullable = true)
 |-- status: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- funding_rounds: double (nullable = true)
 |-- founded_at: date (nullable = true)
 |-- founded_month: timestamp (nullable = true)
 |-- founded_quarter: string (nullable = true)
 |-- founded_year: double (nullable = true)
 |-- first_funding_at: date (nullable = true)
 |-- last_funding_at: date (nullable = true)
 |-- seed: double (nullable = true)
 |-- venture: double (nullable = true)
 |-- equity_crowdfunding: double (nullable = true)
 |-- undisclosed: double (nullable = true)
 |-- convertible_note: double (nullable = true)
 |-- debt_f

Machine Learning to Predict Startup Success using PySpark


In [55]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col

# Ensure necessary columns are cast to the correct types
df = df.withColumn("funding_total_usd", col("funding_total_usd").cast("double"))
df = df.withColumn("founded_year", col("founded_year").cast("integer"))

# Fill null values
df = df.fillna({'funding_total_usd': 0, 'founded_year': 0})

# Selecting relevant features for prediction
# Replace 'name' with the correct column name
selected_columns = ['name', 'funding_total_usd', 'founded_year', 'is_successful']
df = df.select(selected_columns)

# VectorAssembler to combine feature columns into a single feature vector
assembler = VectorAssembler(inputCols=['funding_total_usd', 'founded_year'], outputCol='features')

# Check if 'features' column already exists and drop it if necessary
if 'features' in df.columns:
    df = df.drop('features')

df = assembler.transform(df)

# Split the data into training and test sets
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

# Logistic Regression model
lr = LogisticRegression(labelCol='is_successful', featuresCol='features')
model = lr.fit(train_df)

# Predictions on the test set
predictions = model.transform(test_df)

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol='is_successful')
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy}")

# Convert predictions to Pandas DataFrame and save for Tableau visualization
predictions_pd = predictions.select("name", "funding_total_usd", "founded_year", "prediction", "probability", "is_successful").toPandas()
predictions_pd.to_csv('data/predictions_investments_VC.csv', index=False)

Model Accuracy: 0.5933871131272014
