**Importing the Datafile**

In [29]:
# import appropriate packages
import numpy as np
import pandas as pd

In [30]:
# read the IQR excel file 
dfx = pd.read_csv("../csv_files/p1iqr.csv", index_col=0)
dfx.head(10)

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,C5p,C2,C3p
0,122.0,2.625,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,3.864345,1.0,0
1,254.375,-1.62,-0.013352,28869196.0,2400000.0,25.936,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0,7.882541,0.0,1
2,90.0,-1.24,0.020715,16845668.0,5000000.0,7.378,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0,3.369134,1.0,1
3,209.0,-0.91,0.020023,14848637.0,4500000.0,8.526,328.0,8259.0,7574.0,177.0,509.0,80.0,59.0,110.0,3.299697,1.0,1
4,80.0,0.07,-0.034895,30741716.0,8250000.0,632.298,572.0,14830.0,13176.0,336.0,720.0,67.0,149.0,167.0,3.726269,1.0,0
5,94.0,0.06,-0.057556,42402916.0,7407407.0,197.591,510.0,17999.0,13934.0,328.0,815.0,62.0,96.0,181.0,5.724394,1.0,0
6,128.0,-1.87,0.004106,19047022.0,5500000.0,5.146,495.0,15758.0,13953.0,379.0,698.0,82.0,106.0,135.0,3.463095,1.0,1
7,91.0,-0.74,0.04062,19371750.0,8000000.0,500.459962,334.0,11226.0,10056.0,276.0,-1.0,55.0,120.0,122.0,2.421469,0.0,1
8,100.0,-0.51,-0.029316,52982501.0,11000000.0,279.6,620.0,14813.0,13463.0,335.0,917.0,58.0,124.0,234.0,4.816591,1.0,1
9,106.0,1.49,-0.024925,33622690.0,11000000.0,494.008,385.0,9134.0,8107.0,198.0,459.0,81.0,90.0,147.0,3.056608,1.0,0


In [31]:
# filter out binary variables 
# will concat with dfx later in the notebook 
dfbinary = dfx.filter(['C2', 'C3p'])
dfbinary.head()

Unnamed: 0,C2,C3p
0,1.0,0
1,0.0,1
2,1.0,1
3,1.0,1
4,1.0,0


In [32]:
dfx = dfx.drop(labels=['C2', 'C3p'], axis=1)
dfx.head()

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,C5p
0,122.0,2.625,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,3.864345
1,254.375,-1.62,-0.013352,28869196.0,2400000.0,25.936,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0,7.882541
2,90.0,-1.24,0.020715,16845668.0,5000000.0,7.378,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0,3.369134
3,209.0,-0.91,0.020023,14848637.0,4500000.0,8.526,328.0,8259.0,7574.0,177.0,509.0,80.0,59.0,110.0,3.299697
4,80.0,0.07,-0.034895,30741716.0,8250000.0,632.298,572.0,14830.0,13176.0,336.0,720.0,67.0,149.0,167.0,3.726269


In [33]:
# checking for null values 
print(dfx.isnull().sum(axis=0).tolist())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [34]:
# exmaining the columns datatypes and number of records 
dfx.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 682 entries, 0 to 681
Data columns (total 15 columns):
C1     682 non-null float64
C3     682 non-null float64
C4     682 non-null float64
C5     682 non-null float64
C6     682 non-null float64
C7     682 non-null float64
T1     682 non-null float64
T2     682 non-null float64
T3     682 non-null float64
T4     682 non-null float64
T5     682 non-null float64
S1     682 non-null float64
S2     682 non-null float64
S3     682 non-null float64
C5p    682 non-null float64
dtypes: float64(15)
memory usage: 85.2 KB


In [35]:
# read in y variable csv because we will be making the original variables into ratio varialbes 
dfy = pd.read_csv('../csv_files/y.csv', index_col=0)
dfy.head()

Unnamed: 0,pIPO,pH,pL,p1day,Pmid,Y1,Y2,C6p
0,10.0,9.5,8.5,11.87,9.0,0,1,11.111111
1,8.0,10.0,8.0,7.25,9.0,1,0,0.0
2,7.0,14.0,12.0,6.7,13.0,1,0,0.0
3,11.5,16.0,14.0,12.39,15.0,1,1,0.0
4,21.0,21.0,19.0,56.599998,20.0,0,1,5.0


**Creating Target Variables**

In [36]:
# Define a pmid column to utilize later
def Pmid(row):
    return (row['pH'] + row['pL'])/2

# add a new column with the values returned from the previous function
dfy['Pmid'] = dfy.apply(lambda row: Pmid(row),axis=1)

In [37]:
# Define Pre-IPO Revision
def Y1Func(row):
    if row['pIPO'] < row['Pmid']:
        return 1
    return 0

# add a new column with the values returned from the previous function
dfy['Y1'] = dfy.apply(lambda row: Y1Func(row),axis=1)

In [38]:
# Define Post-IPO Initial Return
def Y2Func(row):
    if row['pIPO'] < row['p1day']:
        return 1
    return 0
# add a new column with the values returned from the previous function
dfy['Y2'] = dfy.apply(lambda row: Y2Func(row),axis=1)

**Creating Control Variables**

In [39]:
# Define Positive EPS Dummy
#def C3pFunc(row):
    #if row['C3'] > 0:
        #return 1
    #return 0
# add a new column with the values returned from the previous function
#dfx['C3p'] = dfx.apply(lambda row: C3pFunc(row),axis=1)

In [40]:
# Define Share Overhang
def C5pFunc(row):
    return row['C5']/row['C6']
    
# add a new column with the values returned from the previous function
dfx['C5p'] = dfx.apply(lambda row: C5pFunc(row),axis=1)

In [41]:
# Define Up Revision
def C6pFunc(row):
    if row['pIPO'] > row['Pmid']:
        return 100*(row['pIPO']-row['Pmid'])/row['Pmid']
    return 0
    
# add a new column with the values returned from the previous function
dfy['C6p'] = dfy.apply(lambda row: C6pFunc(row),axis=1)
dfy.head()

Unnamed: 0,pIPO,pH,pL,p1day,Pmid,Y1,Y2,C6p
0,10.0,9.5,8.5,11.87,9.0,0,1,11.111111
1,8.0,10.0,8.0,7.25,9.0,1,0,0.0
2,7.0,14.0,12.0,6.7,13.0,1,0,0.0
3,11.5,16.0,14.0,12.39,15.0,1,1,0.0
4,21.0,21.0,19.0,56.599998,20.0,0,1,5.0


**Creating Ratio Variables**

In [42]:
# Define Long Sentences
def LongSentences(row):
    return row['T4']/row['T1']
    
# add a new column with the values returned from the previous function
dfx['LongSentences'] = dfx.apply(lambda row: LongSentences(row),axis=1)

In [43]:
# Define Real Words
def RealWords(row):
    if row['T2'] == 0 :
        return 0
    else: 
        return row['T3']/row['T2']

# add a new column with the values returned from the previous function
dfx['RealWords'] = dfx.apply(lambda row: RealWords(row),axis=1)

In [44]:
# Define Long Words
def LongWords(row):
    if row['T2'] == 0 :
        return 0
    else: 
        return row['T5']/row['T2']
    
# add a new column with the values returned from the previous function
dfx['LongWords'] = dfx.apply(lambda row: LongWords(row),axis=1)

In [45]:
# Define Positive Words
def PositiveWords(row):
    if row['T2'] == 0 :
        return 0
    else: 
        return row['S1']/row['T2']
    
# add a new column with the values returned from the previous function
dfx['PositiveWords'] = dfx.apply(lambda row: PositiveWords(row),axis=1)

In [46]:
# Define Negative Words
def NegativeWords(row):
    if row['T2'] == 0 :
        return 0
    else: 
        return row['S2']/row['T2']
    
# add a new column with the values returned from the previous function
dfx['NegativeWords'] = dfx.apply(lambda row: NegativeWords(row),axis=1)

In [47]:
# Define Uncertain Words
def UncertainWords(row):
    if row['T2'] == 0 :
        return 0
    else: 
        return row['S3']/row['T2']
    
# add a new column with the values returned from the previous function
dfx['UncertainWords'] = dfx.apply(lambda row: UncertainWords(row),axis=1)

In [48]:
# gettinga sample of dfx to check that the columns were created 
dfx.sample(5)

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,...,S1,S2,S3,C5p,LongSentences,RealWords,LongWords,PositiveWords,NegativeWords,UncertainWords
465,165.0,-1.71,-0.012548,16529400.0,3850000.0,7.881,353.0,9198.0,8349.0,206.0,...,59.0,110.0,90.0,4.293349,0.583569,0.907697,0.05773,0.006414,0.011959,0.009785
564,147.0,-2.775,0.031816,95334370.0,22711750.0,1188.882656,772.0,24307.0,22126.75,564.5,...,138.0,244.875,282.5,4.197579,0.731218,0.910304,0.047023,0.005677,0.010074,0.011622
60,67.0,-0.02,0.012709,50082900.0,9090900.0,32.978,547.0,13486.0,12249.0,331.0,...,64.0,84.0,175.0,5.509125,0.605119,0.908275,0.052944,0.004746,0.006229,0.012976
602,151.0,-0.6,0.021873,93109390.0,13300000.0,111.943,778.0,24307.0,22126.75,564.5,...,89.0,224.0,231.0,7.000706,0.725578,0.910304,0.053189,0.003661,0.009215,0.009503
273,202.0,1.18,0.049163,27846580.0,6666667.0,89.839,368.0,9715.0,8919.0,239.0,...,95.0,74.0,106.0,4.176987,0.649457,0.918065,0.059599,0.009779,0.007617,0.010911


**Normalizing the Data**

In [49]:
# define the function that will normalize the data 
def normalize(col):
    mean = np.mean(col)
    low = min(col)
    high = max(col)
    colnorm = (col - mean) / (high - low)
    return colnorm

In [50]:
# apply the function to the appropriate columns 
columns = ['C1', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 
           'S1', 'S2', 'S3', 'C5p', 'LongSentences', 'RealWords', 'LongWords', 
           'PositiveWords', 'NegativeWords', 'UncertainWords']
dfx_norm = pd.DataFrame()
for column in columns:
    dfx_norm[column] = normalize(dfx[column])
dfx_norm.head()

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,...,S1,S2,S3,C5p,LongSentences,RealWords,LongWords,PositiveWords,NegativeWords,UncertainWords
0,-0.020556,0.509084,0.125174,0.038593,0.04802,-0.210053,0.014349,0.006411,0.014487,0.018281,...,-0.028904,0.018569,-0.011149,-0.006655,0.008512,-0.002878,-0.003019,-0.003241,-0.002751,-0.003176
1,0.521132,-0.277027,-0.120508,-0.093366,-0.32157,-0.231426,0.461424,0.381669,0.37717,0.390167,...,0.033005,0.574433,0.370918,0.075532,0.011522,-0.002946,-0.003343,-0.005246,-0.001676,-0.003203
2,-0.151502,-0.206657,0.076767,-0.224568,-0.204383,-0.247037,-0.360303,-0.30201,-0.291704,-0.289549,...,-0.035782,-0.354972,-0.319141,-0.01164,0.006001,-0.002875,-0.002165,0.005091,-0.004402,-0.002934
3,0.335454,-0.145546,0.072761,-0.24636,-0.226918,-0.246071,-0.183423,-0.178054,-0.165657,-0.20236,...,0.094914,-0.239352,-0.124209,-0.012339,-0.061584,-0.002823,-0.00218,0.002727,-0.00391,-0.001954
4,-0.192423,0.035936,-0.245261,-0.072933,-0.057899,0.278632,0.15641,0.093722,0.087521,0.080558,...,0.00549,0.16087,0.098013,-0.008045,-0.028356,-0.003015,-0.003667,-0.003683,-0.002272,-0.003006


In [51]:
# import the preprocessing package from sklearn
from sklearn import preprocessing

# scale all columns to have a range of 0-1
x = dfx[columns].values
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
dfx_scaled = pd.DataFrame(x_scaled,columns=columns)
dfx_scaled.head()

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,...,S1,S2,S3,C5p,LongSentences,RealWords,LongWords,PositiveWords,NegativeWords,UncertainWords
0,0.458312,1.0,0.630418,0.406682,0.4541,0.043128,0.470752,0.520721,0.522445,0.531139,...,0.433362,0.431351,0.440546,0.034399,0.441786,0.006121,0.006176,0.006161,0.003974,0.002967
1,1.0,0.213889,0.384735,0.274723,0.08451,0.021755,0.917827,0.89598,0.885128,0.903025,...,0.495271,0.987215,0.822612,0.116586,0.444796,0.006053,0.005851,0.004156,0.005049,0.00294
2,0.327366,0.284259,0.582011,0.143521,0.201697,0.006144,0.0961,0.2123,0.216254,0.22331,...,0.426483,0.05781,0.132554,0.029414,0.439275,0.006125,0.00703,0.014493,0.002323,0.003209
3,0.814322,0.34537,0.578004,0.121729,0.179161,0.00711,0.272981,0.336256,0.342301,0.310498,...,0.55718,0.17343,0.327485,0.028715,0.37169,0.006177,0.007015,0.012129,0.002815,0.004189
4,0.286445,0.526852,0.259982,0.295157,0.348181,0.531813,0.612813,0.608032,0.595478,0.593416,...,0.467756,0.573652,0.549708,0.033009,0.404918,0.005984,0.005528,0.005718,0.004453,0.003137


In [52]:
# verify that the min and max values for each column are 0 and 1 respectively 
dfx_scaled.describe()

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,...,S1,S2,S3,C5p,LongSentences,RealWords,LongWords,PositiveWords,NegativeWords,UncertainWords
count,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,...,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0
mean,0.478868,0.490916,0.505244,0.368089,0.406079,0.253181,0.456403,0.51431,0.507958,0.512858,...,0.462266,0.412782,0.451694,0.041054,0.433274,0.008999,0.009195,0.009402,0.006725,0.006143
std,0.253209,0.261763,0.186123,0.282768,0.27308,0.299389,0.212153,0.195645,0.191904,0.192641,...,0.211455,0.246954,0.213469,0.059791,0.057621,0.052583,0.054156,0.047612,0.053366,0.050622
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.306905,0.375,0.384735,0.164267,0.201885,0.034856,0.305362,0.375,0.369146,0.375,...,0.316423,0.235686,0.288499,0.025201,0.400329,0.005954,0.005407,0.005268,0.002756,0.00228
50%,0.409207,0.526852,0.504223,0.260783,0.314377,0.113276,0.435237,0.492927,0.4876,0.492883,...,0.419604,0.357977,0.421053,0.034367,0.434831,0.00605,0.005914,0.006405,0.003644,0.003193
75%,0.584143,0.625,0.641226,0.49856,0.521131,0.420914,0.583217,0.625,0.621488,0.625,...,0.589854,0.541412,0.573099,0.042698,0.464502,0.006131,0.006401,0.007766,0.004635,0.003944
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [53]:
# concat scaled dataframe with binary dataframe 
dfx = pd.concat([dfx_scaled, dfbinary], axis=1)
dfx.head()

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,...,S3,C5p,LongSentences,RealWords,LongWords,PositiveWords,NegativeWords,UncertainWords,C2,C3p
0,0.458312,1.0,0.630418,0.406682,0.4541,0.043128,0.470752,0.520721,0.522445,0.531139,...,0.440546,0.034399,0.441786,0.006121,0.006176,0.006161,0.003974,0.002967,1.0,0
1,1.0,0.213889,0.384735,0.274723,0.08451,0.021755,0.917827,0.89598,0.885128,0.903025,...,0.822612,0.116586,0.444796,0.006053,0.005851,0.004156,0.005049,0.00294,0.0,1
2,0.327366,0.284259,0.582011,0.143521,0.201697,0.006144,0.0961,0.2123,0.216254,0.22331,...,0.132554,0.029414,0.439275,0.006125,0.00703,0.014493,0.002323,0.003209,1.0,1
3,0.814322,0.34537,0.578004,0.121729,0.179161,0.00711,0.272981,0.336256,0.342301,0.310498,...,0.327485,0.028715,0.37169,0.006177,0.007015,0.012129,0.002815,0.004189,1.0,1
4,0.286445,0.526852,0.259982,0.295157,0.348181,0.531813,0.612813,0.608032,0.595478,0.593416,...,0.549708,0.033009,0.404918,0.005984,0.005528,0.005718,0.004453,0.003137,1.0,0


In [54]:
# verify that new dfx dataframe has the scaled variables 
dfx.describe()

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,...,S3,C5p,LongSentences,RealWords,LongWords,PositiveWords,NegativeWords,UncertainWords,C2,C3p
count,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,...,682.0,682.0,682.0,682.0,682.0,682.0,682.0,682.0,660.0,682.0
mean,0.478868,0.490916,0.505244,0.368089,0.406079,0.253181,0.456403,0.51431,0.507958,0.512858,...,0.451694,0.041054,0.433274,0.008999,0.009195,0.009402,0.006725,0.006143,0.859091,0.463343
std,0.253209,0.261763,0.186123,0.282768,0.27308,0.299389,0.212153,0.195645,0.191904,0.192641,...,0.213469,0.059791,0.057621,0.052583,0.054156,0.047612,0.053366,0.050622,0.348192,0.49902
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.306905,0.375,0.384735,0.164267,0.201885,0.034856,0.305362,0.375,0.369146,0.375,...,0.288499,0.025201,0.400329,0.005954,0.005407,0.005268,0.002756,0.00228,1.0,0.0
50%,0.409207,0.526852,0.504223,0.260783,0.314377,0.113276,0.435237,0.492927,0.4876,0.492883,...,0.421053,0.034367,0.434831,0.00605,0.005914,0.006405,0.003644,0.003193,1.0,0.0
75%,0.584143,0.625,0.641226,0.49856,0.521131,0.420914,0.583217,0.625,0.621488,0.625,...,0.573099,0.042698,0.464502,0.006131,0.006401,0.007766,0.004635,0.003944,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [55]:
dfx.to_csv('../csv_files/p1minmax.csv')