Step 1: Import Libraries


In [6]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report

from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'xgboost'

Step 2: Load and Explore Data


In [9]:
# Load Datasets
ipo = pd.read_csv("ipo.csv")
cleaned_ipo = pd.read_csv("cleaned_ipo_data 2022-25.csv")

# Display First Few Rows
print(ipo.head())
print(cleaned_ipo.head(380))

# Check for Missing Values
print(ipo.isnull().sum())
print(cleaned_ipo.isnull().sum())

   Unnamed: 0 Unnamed: 0_level_0    Issue Details          Issue Details.1  \
0         NaN               Date         IPO Name  Issue Size  (in crores)   
1         0.0         17-10-2022  Electronics Mar                      500   
2         1.0         26-09-2022  Harsha Engineer                      755   
3         2.0         15-09-2022              TMB                      792   
4         3.0         06-09-2022  Dreamfolks Serv                    562.1   

  Subscription Subscription.1 Subscription.2 Subscription.3  Price  \
0          QIB            HNI            RII          Total  Issue   
1        58.81          15.39           8.27          24.23     59   
2       113.82          40.36          12.44          47.19    330   
3         0.51           1.77           3.44           1.39    525   
4        27.48          14.18          24.19          23.25    326   

        Price.1        Price.2           Price.3 Price.4             Price.5  
0  Listing Open  Listing Close 

Combine datasets


In [10]:
from fuzzywuzzy import process

# Fuzzy Match Company Names
def fuzzy_merge(df1, df2, key1, key2, threshold=80):
    matches = []
    for name in df1[key1]:
        match, score = process.extractOne(name, df2[key2])
        if score >= threshold:
            matches.append(match)
        else:
            matches.append(None)
    df1['matched_name'] = matches
    return df1.merge(df2, left_on='matched_name', right_on=key2, how='inner')

combined = fuzzy_merge(ipo, cleaned_ipo, 'IPO Name', 'Name')

# Display Combined Dataset
print(combined.head())

ModuleNotFoundError: No module named 'fuzzywuzzy'

 preprocess the data.

In [11]:
# Target Variable (Binary)
combined['Target'] = ((combined['Listing Gains(%)'].fillna(0) > 0) | (combined['Returns'].fillna(0) > 0)).astype(int)

# Log Transform Skewed Features
combined['Log_Issue_Size'] = np.log1p(combined['Issue Size (in crores)'])

# Investor Category (Categorical)
combined['Investor_Category'] = pd.cut(
    combined['RII'], 
    bins=[-np.inf, 50, 100, np.inf], 
    labels=['Retail', 'HNI', 'Institutional']
)

# Drop Unnecessary Columns
combined = combined[['Log_Issue_Size', 'Total', 'Investor_Category', 'Target']]

NameError: name 'combined' is not defined

Step 5: Train-Test Split

In [12]:
# Define Features and Target
X = combined.drop(columns=['Target'])
y = combined['Target']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

NameError: name 'combined' is not defined

Step 6: Build Pipeline

In [13]:
# Preprocessing Pipelines
numerical_features = ['Log_Issue_Size', 'Total']
categorical_features = ['Investor_Category']

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Full Pipeline with Model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42))
])

NameError: name 'XGBClassifier' is not defined

Step 7: Hyperparameter Tuning

In [15]:
# Hyperparameter Grid
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.2]
}

# Grid Search
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

# Best Parameters
print("Best Parameters:", grid_search.best_params_)

NameError: name 'model' is not defined

Step 8: Evaluate the Model

In [16]:
# Predict Probabilities
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

# ROC-AUC Score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC: {roc_auc:.2f}")

# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.legend()
plt.show()

# Classification Report
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))

NameError: name 'grid_search' is not defined

Step 9: Deploy the Model

In [17]:
# Save Model
import joblib
joblib.dump(grid_search, "ipo_model.pkl")



NameError: name 'grid_search' is not defined

Step 10: Run the Streamlit App

In [18]:
streamlit run app.py

SyntaxError: invalid syntax (3737097518.py, line 1)