### Import Librabries

In [1]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import `logistic regression` model
from sklearn.linear_model import LogisticRegression

### Load the source data

In [2]:
# Create dataframe and fill missing values
df = pd.read_csv('Competition1_raw_data.csv', header=0, na_values=['-'])
df.head(5)
df.dtypes

I1          object
I2          object
I3          object
P(IPO)     float64
P(H)       float64
P(L)       float64
P(1Day)    float64
C1         float64
C2         float64
C3         float64
C4         float64
C5         float64
C6         float64
C7         float64
T1         float64
T2         float64
T3         float64
T4         float64
T5         float64
S1         float64
S2         float64
S3         float64
dtype: object

### Analyze the Source Data

In [3]:
df.shape
df.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3
count,677.0,672.0,672.0,660.0,660.0,660.0,646.0,660.0,676.0,676.0,610.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0
mean,13.837666,15.48119,13.515045,25.934766,149.728788,0.859091,1.788904,0.007282,49357760.0,12415190.0,500.459962,465.634361,12758.606461,11395.844347,294.353891,679.220264,68.421439,120.104258,144.759178
std,6.053731,6.653429,5.835646,73.234948,152.817467,0.348192,162.666532,0.033318,104376400.0,25128550.0,1648.337634,175.741647,5449.644597,4839.670179,121.532637,472.914323,39.096525,84.828959,69.276285
min,3.0,0.0,3.0,0.0,10.0,0.0,-786.239,-0.162352,3693227.0,525000.0,0.074,132.0,0.0,0.0,0.0,-1.0,-1.0,20.0,26.0
25%,10.0,12.5,11.0,11.0,85.0,1.0,-0.8525,-0.013927,18714170.0,5000000.0,37.24575,351.0,9195.0,8162.0,213.0,462.0,45.0,73.0,100.0
50%,13.5,15.0,13.0,14.845,107.0,1.0,0.01,0.009125,27400180.0,7398704.0,103.833,444.0,12045.0,10785.0,279.0,624.0,60.0,100.0,134.0
75%,17.0,17.0,15.0,20.485,155.25,1.0,0.47,0.031571,49807860.0,12000000.0,331.138,551.0,15241.0,13760.0,354.0,795.0,85.0,142.0,173.0
max,85.0,135.0,108.0,1159.200562,2087.0,1.0,3864.5,0.092896,2138085000.0,421233600.0,30683.0,1750.0,49056.0,43952.0,1058.0,10277.0,309.0,944.0,883.0


In [4]:
# Calculating the number of nulls in each column.
df.isnull().sum()

I1          0
I2          0
I3          8
P(IPO)      5
P(H)       10
P(L)       10
P(1Day)    22
C1         22
C2         22
C3         36
C4         22
C5          6
C6          6
C7         72
T1          1
T2          1
T3          1
T4          1
T5          1
S1          1
S2          1
S3          1
dtype: int64

#### Imputation

In [8]:
# Replace the missing data with mean/median/mode


TypeError: could not convert string to float: '7389, 5063'

In [185]:
# Add Positive EPS dummy column C3_D with dummy variable set to 1 if C3 is positive, otherwise 0. 
df['C3_D'] = pd.get_dummies(df.C3 > 0, drop_first=True, dtype=bool)

# Add Share Overhang column C5_D.
df['C5_D']=df['C5']/df['C6']

# Add Up Revision column C6_D.
df['P(mid)'] = ((df['P(H)']+df['P(L)'])/2)
df['C6_D'] = ((df['P(IPO)']-df['P(mid)'])/df['P(mid)']*0.01).where((df['P(IPO)'] > df['P(mid)']), 0)

#### Adding Target Variables

In [186]:
# Add Pre-IPO Price Revision column Y1.
df['Y1'] = np.where(df['P(IPO)'] < ((df['P(H)'] + df['P(L)'])/2), '1', '0')
df['Y1'] = df['Y1'].astype('int')

#Add Post-IPO Initial Return column Y2.
df['Y2'] = np.where(df['P(IPO)'] < df['P(1Day)'], '1', '0')
df['Y2'] = df['Y2'].astype('int')

df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 28 columns):
I1         682 non-null object
I2         682 non-null object
I3         674 non-null object
P(IPO)     677 non-null float64
P(H)       672 non-null float64
P(L)       672 non-null float64
P(1Day)    660 non-null float64
C1         660 non-null float64
C2         660 non-null float64
C3         646 non-null float64
C4         660 non-null float64
C5         676 non-null float64
C6         676 non-null float64
C7         610 non-null float64
T1         681 non-null float64
T2         681 non-null float64
T3         681 non-null float64
T4         681 non-null float64
T5         681 non-null float64
S1         681 non-null float64
S2         681 non-null float64
S3         681 non-null float64
C3_D       682 non-null bool
C5_D       676 non-null float64
P(mid)     672 non-null float64
C6_D       682 non-null float64
Y1         682 non-null int64
Y2         682 non-null int64
dtypes: b

Unnamed: 0,I1,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,...,T5,S1,S2,S3,C3_D,C5_D,P(mid),C6_D,Y1,Y2
0,AATI,ADVANCED ANALOGIC TECHNOLOGIES INC,3674,10.0,9.5,8.5,11.87,122.0,1.0,3.43,...,690.0,62.0,117.0,139.0,True,3.864345,9.0,0.001111,0,1
1,ABPI,ACCENTIA BIOPHARMACEUTICALS INC,2834,8.0,10.0,8.0,7.25,259.0,0.0,-1.62,...,1120.0,71.0,242.0,237.0,False,12.028832,9.0,0.0,1,0
2,ACAD,ACADIA PHARMACEUTICALS INC,2834,7.0,14.0,12.0,6.7,90.0,1.0,-1.24,...,325.0,61.0,33.0,60.0,False,3.369134,13.0,0.0,1,0
3,ACHN,ACHILLION PHARMACEUTICALS INC,2834,11.5,16.0,14.0,12.39,209.0,1.0,-0.91,...,509.0,80.0,59.0,110.0,False,3.299697,15.0,0.0,1,1
4,ACLI,AMERICAN COMMERCIAL LINES INC.,4492,21.0,21.0,19.0,56.599998,80.0,1.0,0.07,...,720.0,67.0,149.0,167.0,True,3.726269,20.0,0.0005,0,1


In [187]:
# Change the data types 
df['Y1'] = df['Y1'].astype('bool')
df['Y2'] = df['Y2'].astype('bool')
df['C2'] = df['C2'].astype('bool')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 28 columns):
I1         682 non-null object
I2         682 non-null object
I3         674 non-null object
P(IPO)     677 non-null float64
P(H)       672 non-null float64
P(L)       672 non-null float64
P(1Day)    660 non-null float64
C1         660 non-null float64
C2         682 non-null bool
C3         646 non-null float64
C4         660 non-null float64
C5         676 non-null float64
C6         676 non-null float64
C7         610 non-null float64
T1         681 non-null float64
T2         681 non-null float64
T3         681 non-null float64
T4         681 non-null float64
T5         681 non-null float64
S1         681 non-null float64
S2         681 non-null float64
S3         681 non-null float64
C3_D       682 non-null bool
C5_D       676 non-null float64
P(mid)     672 non-null float64
C6_D       682 non-null float64
Y1         682 non-null bool
Y2         682 non-null bool
dtypes: bool(4

In [188]:
# Create column T1_D with Percent of long sentences
df['T1_D']=df['T4']/df['T1']
# Create column T3_D with Percent of real words
df['T3_D']=df['T3']/df['T2']
# Create column T5_D with Percent of long words
df['T5_D']=df['T5']/df['T2']
# Create column S1_D with Percent of Positive words
df['S1_D']=df['S1']/df['T2']
# Create column S2_D with Percent of real words
df['S2_D']=df['S2']/df['T2']
# Create column S3_D with Percent of real words
df['S3_D']=df['S3']/df['T2']

#### Creating new dataframe for predictive analysis of underpricing phenomenon.

In [189]:
# Creating new dataframe by extracting columns from existing dataframe.
data = df[['C1', 'C2', 'C3_D', 'C4', 'C5_D', 'C6_D', 'C7', 'T1_D', 'T3_D', 'T5_D', 'S1_D', 'S2_D', 'S3_D', 'Y1', 'Y2']].copy()
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 15 columns):
C1      660 non-null float64
C2      682 non-null bool
C3_D    682 non-null bool
C4      660 non-null float64
C5_D    676 non-null float64
C6_D    682 non-null float64
C7      610 non-null float64
T1_D    681 non-null float64
T3_D    681 non-null float64
T5_D    681 non-null float64
S1_D    681 non-null float64
S2_D    681 non-null float64
S3_D    681 non-null float64
Y1      682 non-null bool
Y2      682 non-null bool
dtypes: bool(4), float64(11)
memory usage: 61.4 KB


Unnamed: 0,C1,C2,C3_D,C4,C5_D,C6_D,C7,T1_D,T3_D,T5_D,S1_D,S2_D,S3_D,Y1,Y2
0,122.0,True,True,0.029074,3.864345,0.001111,51.345,0.640426,0.908876,0.05425,0.004875,0.009199,0.010929,False,True
1,259.0,False,False,-0.013352,12.028832,0.0,25.936,0.644753,0.898724,0.051395,0.003258,0.011105,0.010876,True,False
2,90.0,True,False,0.020715,3.369134,0.0,7.378,0.636816,0.90935,0.061764,0.011593,0.006271,0.011403,True,False
3,209.0,True,False,0.020023,3.299697,0.0,8.526,0.539634,0.91706,0.06163,0.009686,0.007144,0.013319,True,True
4,80.0,True,True,-0.034895,3.726269,0.0005,632.298,0.587413,0.888469,0.04855,0.004518,0.010047,0.011261,False,True


In [190]:
data.describe()

Unnamed: 0,C1,C4,C5_D,C6_D,C7,T1_D,T3_D,T5_D,S1_D,S2_D,S3_D
count,660.0,660.0,676.0,682.0,610.0,681.0,681.0,681.0,681.0,681.0,681.0
mean,149.728788,0.007282,4.6335,0.000523,500.459962,0.628061,inf,inf,inf,inf,inf
std,152.817467,0.033318,6.272977,0.001074,1648.337634,0.083593,,,,,
min,10.0,-0.162352,0.283223,0.0,0.074,0.0,0.0,-8.9e-05,-9.3e-05,0.002152,0.005125
25%,85.0,-0.013927,2.850622,0.0,37.24575,0.579767,0.883472,0.047261,0.004149,0.007031,0.009568
50%,107.0,0.009125,3.753607,0.0,103.833,0.629032,0.897225,0.051886,0.005058,0.008595,0.011302
75%,155.25,0.031571,4.902128,0.000714,331.138,0.674863,0.909752,0.056455,0.006259,0.010541,0.012891
max,2087.0,0.092896,99.787255,0.01,30683.0,1.443089,inf,inf,inf,inf,inf


Check the shape of the data.