In [36]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Load your dataset
df = pd.read_csv('startup.csv',encoding='latin1')
print(df.columns)

# Drop rows where the target variable 'status' is missing
df_cleaned = df.dropna(subset=['status'])

# Clean all column names by removing leading/trailing spaces
df.columns = df.columns.str.strip()

# Replace non-numeric values in 'funding_total_usd' and convert to float
df['funding_total_usd'] = pd.to_numeric(df['funding_total_usd'].str.replace(',', '').replace(' -   ', np.nan), errors='coerce')

# Impute missing values in 'funding_total_usd' using the median
df['funding_total_usd'].fillna(df['funding_total_usd'].median(), inplace=True)

# Drop rows where the target variable 'status' is missing
df_cleaned = df.dropna(subset=['status'])

# Impute remaining missing values for numeric columns like 'funding_rounds' with the median
df_cleaned['funding_rounds'].fillna(df_cleaned['funding_rounds'].median(), inplace=True)


df_cleaned['market'].fillna(df_cleaned['market'].mode()[0], inplace=True)
df_cleaned['country_code'].fillna(df_cleaned['country_code'].mode()[0], inplace=True)
df_cleaned['region'].fillna(df_cleaned['region'].mode()[0], inplace=True)

# dates:

# Convert the date columns to datetime format
df['founded_at'] = pd.to_datetime(df['founded_at'], errors='coerce')
df['first_funding_at'] = pd.to_datetime(df['first_funding_at'], errors='coerce')
df['last_funding_at'] = pd.to_datetime(df['last_funding_at'], errors='coerce')

# Check for remaining missing values
print(df_cleaned.isnull().sum())



Index(['permalink', 'name', 'homepage_url', 'category_list', ' market ',
       ' funding_total_usd ', 'status', 'country_code', 'state_code', 'region',
       'city', 'funding_rounds', 'founded_at', 'founded_month',
       'founded_quarter', 'founded_year', 'first_funding_at',
       'last_funding_at', 'seed', 'venture', 'equity_crowdfunding',
       'undisclosed', 'convertible_note', 'debt_financing', 'angel', 'grant',
       'private_equity', 'post_ipo_equity', 'post_ipo_debt',
       'secondary_market', 'product_crowdfunding', 'round_A', 'round_B',
       'round_C', 'round_D', 'round_E', 'round_F', 'round_G', 'round_H'],
      dtype='object')
permalink                   0
name                        1
homepage_url             3377
category_list            3582
market                      0
funding_total_usd           0
status                      0
country_code                0
state_code              18574
region                      0
city                     5857
funding_rounds 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['funding_rounds'].fillna(df_cleaned['funding_rounds'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['market'].fillna(df_cleaned['market'].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['country_code'].fillna(df_cleaned['country_code'].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.

In [40]:
# Select only numeric columns
numeric_df = df.select_dtypes(include=['number'])

# Display the resulting DataFrame with numeric columns
print(numeric_df.head())

   funding_total_usd  funding_rounds  founded_year       seed    venture  \
0          1750000.0             1.0        2012.0  1750000.0        0.0   
1          4000000.0             2.0           NaN        0.0  4000000.0   
2            40000.0             1.0        2012.0    40000.0        0.0   
3          1500000.0             1.0        2011.0  1500000.0        0.0   
4            60000.0             2.0        2014.0        0.0        0.0   

   equity_crowdfunding  undisclosed  convertible_note  debt_financing  angel  \
0                  0.0          0.0               0.0             0.0    0.0   
1                  0.0          0.0               0.0             0.0    0.0   
2                  0.0          0.0               0.0             0.0    0.0   
3                  0.0          0.0               0.0             0.0    0.0   
4              60000.0          0.0               0.0             0.0    0.0   

   grant  private_equity  post_ipo_equity  post_ipo_debt  seco

In [45]:
# Include 'status' column for target variable
target_column = 'status'

# Select only numeric columns and include 'status'
numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
if 'founded_year' in numeric_columns:
    numeric_columns.remove('founded_year')

# Include the 'market' column
categorical_columns = ['market']

# One-hot encode the 'market' column
df_encoded = pd.get_dummies(df[categorical_columns], drop_first=True)

# Combine numeric columns with the encoded categorical columns and include 'status'
df_combined = pd.concat([df[numeric_columns + [target_column]], df_encoded], axis=1)

# Drop rows with NaN values
df_combined.dropna(inplace=True)

# Target variable
y = df_combined[target_column]
X = df_combined.drop(columns=[target_column])

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Get explained variance and create a DataFrame
explained_variance = pca.explained_variance_ratio_
scores_df = pd.DataFrame({
    'Feature': X.columns,
    'Explained Variance': explained_variance
})
print(scores_df)


                                            Feature  Explained Variance
0                                 funding_total_usd        4.322862e-03
1                                    funding_rounds        2.572234e-03
2                                              seed        2.436170e-03
3                                           venture        1.697899e-03
4                               equity_crowdfunding        1.588916e-03
5                                       undisclosed        1.560882e-03
6                                  convertible_note        1.538833e-03
7                                    debt_financing        1.464907e-03
8                                             angel        1.459201e-03
9                                             grant        1.453363e-03
10                                   private_equity        1.431905e-03
11                                  post_ipo_equity        1.419921e-03
12                                    post_ipo_debt        1.404

In [47]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Select top 10 features based on explained variance
top_features_indices = scores_df.nlargest(10, 'Explained Variance').index
X_top = X_pca[:, top_features_indices]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

# Set up the SVM model with hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale']
}

grid_search = GridSearchCV(SVC(), param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Best Parameters:", grid_search.best_params_)

print("Accuracy Score:", accuracy)

Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy Score: 0.8687792207792208
