In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import backend as bk
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from pandas import ExcelWriter

plt.style.use('ggplot')
pd.options.display.max_columns = None

## Import File and Reset Index

In [2]:
df = pd.read_excel('Competition1_raw_data.xlsx', na_values='-')
df.set_index('I1', inplace=True)
df.head(5)

Unnamed: 0_level_0,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AATI,ADVANCED ANALOGIC TECHNOLOGIES INC,3674,10.0,9.5,8.5,11.87,122.0,1.0,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0
ABPI,ACCENTIA BIOPHARMACEUTICALS INC,2834,8.0,10.0,8.0,7.25,259.0,0.0,-1.62,-0.013352,28869196.0,2400000.0,25.936,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0
ACAD,ACADIA PHARMACEUTICALS INC,2834,7.0,14.0,12.0,6.7,90.0,1.0,-1.24,0.020715,16845668.0,5000000.0,7.378,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0
ACHN,ACHILLION PHARMACEUTICALS INC,2834,11.5,16.0,14.0,12.39,209.0,1.0,-0.91,0.020023,14848637.0,4500000.0,8.526,328.0,8259.0,7574.0,177.0,509.0,80.0,59.0,110.0
ACLI,AMERICAN COMMERCIAL LINES INC.,4492,21.0,21.0,19.0,56.599998,80.0,1.0,0.07,-0.034895,30741716.0,8250000.0,632.298,572.0,14830.0,13176.0,336.0,720.0,67.0,149.0,167.0


## Changing DataTypes

In [3]:
to_float = ['I3','P(IPO)','P(H)', 'P(L)', 'P(1Day)', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'S1', 'S2', 'S3']

for col in to_float:
    df[col]=pd.to_numeric(df[col], errors='coerce')
df['C2'] = df['C2'].astype('bool') 
print(df.dtypes)

I2          object
I3         float64
P(IPO)     float64
P(H)       float64
P(L)       float64
P(1Day)    float64
C1         float64
C2            bool
C3         float64
C4         float64
C5         float64
C6         float64
C7         float64
T1         float64
T2         float64
T3         float64
T4         float64
T5         float64
S1         float64
S2         float64
S3         float64
dtype: object


## Cleanining/Imputing

For our predictor variables only

In [4]:
df = df.dropna(subset=['P(1Day)'])
print(df.isna().sum())

## Creating Necessary Variables Y1, Y2, C3', C5', and C6'
## Code found on https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column
df['P(Mid)'] = (df['P(H)'] + df['P(L)']) / 2
df['Y1'] = np.where(df['P(IPO)'] < df['P(Mid)'], 1,0)
df['Y2'] = np.where(df['P(IPO)'] < df['P(1Day)'], 1,0)
df["C3_Prime"] = np.where(df['C3'] > 0, 1, 0)
df["C5_Prime"] = df['C5'] / df['C6']
df["C6_Prime"] = np.where(df['P(IPO)'] > df['P(Mid)'], ((df['P(IPO)'] - df['P(Mid)']) / df['P(Mid)']) * 100,0)
df.head(1)

I2          0
I3          4
P(IPO)      0
P(H)        0
P(L)        0
P(1Day)     0
C1          0
C2          0
C3         24
C4          0
C5          0
C6          0
C7         60
T1          0
T2          0
T3          0
T4          0
T5          0
S1          0
S2          0
S3          0
dtype: int64


Unnamed: 0_level_0,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
AATI,ADVANCED ANALOGIC TECHNOLOGIES INC,3674.0,10.0,9.5,8.5,11.87,122.0,True,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,9.0,0,1,1,3.864345,11.111111


## Now we need to change our indusrt columns

In [5]:
df['industry'] = df['I3'].apply(bk.assign_sector)

df['industry'] = df['industry'].replace(['Mining', 'Wholesale Trade', 'Finance/Insurance/Realestate', 'Non Classafiable Establishments',
                                            'Agriculture/Forestry/Fishing', 'Construction', 'NaN'],'Other')
df['industry'].value_counts()

df['industry'] = df['industry'].replace({'Manufacturing': 1, 'Services': 2, 'Transportation/Communications/Utulities': 3, 'Retail Trade': 4,'Other': 5})

df['industry'] = df['industry'].fillna(5)

df['industry_1'] = np.where(df['industry'] ==1, 1, 0)
df['industry_2'] = np.where(df['industry'] ==2, 1, 0)
df['industry_3'] = np.where(df['industry'] ==3, 1, 0)
df['industry_4'] = np.where(df['industry'] ==4, 1, 0)
df['industry_5'] = np.where(df['industry'] ==5, 1, 0)

df.head(1)

Unnamed: 0_level_0,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
AATI,ADVANCED ANALOGIC TECHNOLOGIES INC,3674.0,10.0,9.5,8.5,11.87,122.0,True,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,9.0,0,1,1,3.864345,11.111111,1.0,1,0,0,0,0


## Now we need to create our ratio columns


In [6]:
t2_mean = df['T2'].mean()
df['T2'] = np.where(df['T2'] ==0, t2_mean, df['T2'])
print(t2_mean)

df["S1_Prime"]= df['S1']/df['T2']
df["S2_Prime"]= df['S2']/df['T2']
df["S3_Prime"]= df['S3']/df['T2']
df["T3_Prime"]= df['T3']/df['T2']
df["T4_Prime"]= df['T4']/df['T1']
df["T5_Prime"]= df['T5']/df['T2']

df.head(3)

12849.70303030303


Unnamed: 0_level_0,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
AATI,ADVANCED ANALOGIC TECHNOLOGIES INC,3674.0,10.0,9.5,8.5,11.87,122.0,True,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,9.0,0,1,1,3.864345,11.111111,1.0,1,0,0,0,0,0.004875,0.009199,0.010929,0.908876,0.640426,0.05425
ABPI,ACCENTIA BIOPHARMACEUTICALS INC,2834.0,8.0,10.0,8.0,7.25,259.0,False,-1.62,-0.013352,28869196.0,2400000.0,25.936,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0,9.0,1,0,0,12.028832,0.0,1.0,1,0,0,0,0,0.003258,0.011105,0.010876,0.898724,0.644753,0.051395
ACAD,ACADIA PHARMACEUTICALS INC,2834.0,7.0,14.0,12.0,6.7,90.0,True,-1.24,0.020715,16845668.0,5000000.0,7.378,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0,13.0,1,0,0,3.369134,0.0,1.0,1,0,0,0,0,0.011593,0.006271,0.011403,0.90935,0.636816,0.061764


## No longer need these columns

In [7]:
df = df.drop(columns = ['I2', 'I3'])
df.head(1)

Unnamed: 0_level_0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
AATI,10.0,9.5,8.5,11.87,122.0,True,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,9.0,0,1,1,3.864345,11.111111,1.0,1,0,0,0,0,0.004875,0.009199,0.010929,0.908876,0.640426,0.05425


## Getting rid of our 0 value columns

In [8]:
#T3 need to get rid of thier zeros.
t3_mean = df['T3'].mean()
df['T3'] = np.where(df['T3'] ==0, t3_mean, df['T3'])
print(t3_mean)

#S1 need to get rid of thier zeros.
s1_mean = df['S1'].mean()
df['S1'] = np.where(df['S1'] ==-1, s1_mean, df['S1'])
print(s1_mean)

#T5 need to get rid of thier zeros.
t5_mean = df['T5'].mean()
df['T5'] = np.where(df['T5'] ==-1, s1_mean, df['T5'])
print(t5_mean)

11463.39393939394
68.67575757575757
669.8909090909091


## More Imputation

In [9]:
#nromalize qnd impute C7 values
df['C7'] = np.log(df['C7'])
c7_mean = df['C7'].mean()
print(c7_mean)
df['C7'] = df['C7'].fillna(c7_mean)

#normalize and impute C3 values
df['C3'] = np.cbrt(df['C3'])
c3_mean = df['C3'].mean()
print(c3_mean)
df['C3'] = df['C3'].fillna(c3_mean)

4.592588484196865
-0.21843318437028425


## Creyting More Variables for us to use


## Normalize the Rest of the DF

In [10]:
df['market_cap'] =df['P(IPO)'] * df['C6']
df.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime,market_cap
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,14.034621,15.610394,13.635394,25.934766,149.728788,-0.218433,0.007282,49962850.0,12582480.0,4.592588,468.701515,12888.641524,11515.500275,295.957576,669.996478,68.781327,120.89697,145.516667,14.622894,0.5,0.698485,0.477273,4.417255,5.404645,2.048485,0.448485,0.30303,0.087879,0.072727,0.087879,0.005308,0.009055,0.011516,0.891504,0.62672,0.052171,231880400.0
std,5.954214,6.613816,5.790112,73.234948,152.817467,1.534849,0.033318,105537700.0,25399400.0,1.91664,176.37443,5415.681452,4805.168278,122.172959,298.043412,39.31667,85.542267,69.824545,6.187137,0.500379,0.459264,0.499862,5.133537,10.877569,1.272118,0.497716,0.459917,0.283333,0.259885,0.283333,0.001755,0.00299,0.002842,0.071672,0.077686,0.007347,963720100.0
min,5.0,0.0,5.0,0.0,10.0,-9.229642,-0.162352,3693227.0,800000.0,-2.60369,132.0,3271.0,3003.0,0.0,68.675758,9.0,20.0,40.0,4.0,0.0,0.0,0.0,0.283223,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-9.3e-05,0.002152,0.005326,0.0,0.0,-8.9e-05,5775000.0
25%,10.0,13.0,11.0,11.0,85.0,-0.926368,-0.013927,18775680.0,5124546.0,3.724681,356.0,9275.5,8297.0,214.0,463.75,45.0,74.0,100.0,12.0,0.0,0.0,0.0,2.850622,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.004131,0.006966,0.009579,0.883417,0.580329,0.047544,60000000.0
50%,13.75,15.5,13.0,14.845,107.0,-0.218433,0.009125,27454380.0,7500000.0,4.592588,446.0,12141.5,10853.0,280.0,627.0,60.0,101.5,135.0,14.25,0.5,1.0,0.0,3.731035,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.005027,0.008577,0.011301,0.897294,0.628861,0.051927,98700000.0
75%,17.0,17.0,15.0,20.485,155.25,0.738329,0.031571,50039860.0,12000000.0,5.706326,553.25,15275.0,13793.5,355.25,801.25,85.0,144.0,174.0,16.0,1.0,1.0,1.0,4.857904,7.692308,2.0,1.0,1.0,0.0,0.0,0.0,0.006254,0.010521,0.012848,0.909693,0.674461,0.056489,176000000.0
max,85.0,135.0,108.0,1159.200562,2087.0,15.692704,0.092896,2138085000.0,421233600.0,10.331464,1750.0,49056.0,43952.0,1058.0,2729.0,309.0,944.0,883.0,121.5,1.0,1.0,1.0,99.787255,100.0,5.0,1.0,1.0,1.0,1.0,1.0,0.012839,0.024605,0.029397,1.49054,0.826347,0.088329,17864000000.0


In [11]:
df['market_cap'].skew()

16.124656833368746

In [12]:
pos_skew = ['P(IPO)', 'P(L)', 'C1', 'C5', 'C6', 'T1', 'T2', 'T3', 'T5', 'S1', 'S2', 'S3', 'P(Mid)', 'C5_Prime', 
            'S2_Prime','S3_Prime', 'S1_Prime', 'market_cap' ]

for col in pos_skew:
    df[col]=np.log(df[col])

cbrt_skew = ['P(1Day)', 'T4', 'C6_Prime', 'P(H)']

for col in cbrt_skew:
    df[col]=np.cbrt(df[col])

over_log = ['C5', 'C6']
for col in over_log:
    df[col]=1/np.log(df[col])

df['T4'] = np.sqrt(df['T4'])

neg_skew = ['T3_Prime', 'T4_Prime']

for col in neg_skew:
    df[col]=df[col]**2


  """


In [13]:
df.to_csv('PythonExport.csv', sep=',')
df.dtypes

P(IPO)        float64
P(H)          float64
P(L)          float64
P(1Day)       float64
C1            float64
C2               bool
C3            float64
C4            float64
C5            float64
C6            float64
C7            float64
T1            float64
T2            float64
T3            float64
T4            float64
T5            float64
S1            float64
S2            float64
S3            float64
P(Mid)        float64
Y1              int64
Y2              int64
C3_Prime        int64
C5_Prime      float64
C6_Prime      float64
industry      float64
industry_1      int64
industry_2      int64
industry_3      int64
industry_4      int64
industry_5      int64
S1_Prime      float64
S2_Prime      float64
S3_Prime      float64
T3_Prime      float64
T4_Prime      float64
T5_Prime      float64
market_cap    float64
dtype: object

In [14]:
cols_all = ['P(IPO)','P(H)','P(L)',	'P(1Day)',	'C1','C3','C4','C5',	'C6',	'C7',	'T1',	'T2',	'T3',	'T4',	'T5',	
        'S1',	'S2',	'S3',	'P(Mid)',	'C3_Prime',	'C5_Prime',	'C6_Prime',	'industry',	'S1_Prime',	
        'S2_Prime',	'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5']

cols_test = ['P(IPO)', 'P(H)', 'P(L)', 'C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'P(Mid)',	'C3_Prime',	'C5_Prime',	'C6_Prime',	'industry', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5']
dataset = df[cols_test].copy()
target_y1 = df[['Y1']].copy()
target_y2 = df[['Y2']].copy()

#s1Prime 

In [15]:

# load the iris datasets
dataset = dataset.values
target = target_y1.values
# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 6)
rfe = rfe.fit(dataset, target_y1)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

  y = column_or_1d(y, warn=True)


[ True  True  True False False False False False False False False False
 False False False False  True False  True False False  True False False
 False False False False False False False False False]
[ 1  1  1 20 24  7 18 26 28 13 27 19  9 14 23  5  1 17  1  6 10  1 21  4
  2  3 15 22 25 11 12  8 16]


In [18]:
cols_keep = ['P(IPO)', 'P(H)', 'P(L)', 'S2', 'P(Mid)', 'C6_Prime', 'Y1', 'Y2']
df_output = df.copy()
df_output = df_output[cols_keep]

In [19]:
df_output.to_csv('datasets/group1.csv', sep=',')

In [20]:
'''
for i in range(1, 33):
    print(i)
    # create a base classifier used to evaluate a subset of attributes
    model = LogisticRegression()
    # create the RFE model and select 3 attributes
    rfe = RFE(model, i)
    rfe = rfe.fit(dataset,  target_y1)
    # summarize the selection of the attributes
    print('Model with the best', i, 'features')
    print(rfe.support_)
    print(rfe.ranking_)
'''

"\nfor i in range(1, 33):\n    print(i)\n    # create a base classifier used to evaluate a subset of attributes\n    model = LogisticRegression()\n    # create the RFE model and select 3 attributes\n    rfe = RFE(model, i)\n    rfe = rfe.fit(dataset,  target_y1)\n    # summarize the selection of the attributes\n    print('Model with the best', i, 'features')\n    print(rfe.support_)\n    print(rfe.ranking_)\n"

## This Notebooks Data Yields a Result of 

- Y1: F1 0.8721; AUC 0.8910
- Y2: F1 0.6369; AUC 0.6271


**note we only tested the variables for y1, not y2