In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import backend as bk
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from pandas import ExcelWriter

plt.style.use('ggplot')
pd.options.display.max_columns = None

## Import File and Reset Index

In [50]:
df = pd.read_excel('Competition1_raw_data.xlsx', na_values='-')
df.set_index('I1', inplace=True)

## Changing DataTypes

In [51]:
to_float = ['I3','P(IPO)','P(H)', 'P(L)', 'P(1Day)', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'S1', 'S2', 'S3']

for col in to_float:
    df[col]=pd.to_numeric(df[col], errors='coerce')
df['C2'] = df['C2'].astype('bool') 

## Cleanining/Imputing

For our predictor variables only

In [52]:
df = df.dropna(subset=['P(1Day)'])
## Creating Necessary Variables Y1, Y2, C3', C5', and C6'
## Code found on https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column
df['P(Mid)'] = (df['P(H)'] + df['P(L)']) / 2
df['Y1'] = np.where(df['P(IPO)'] < df['P(Mid)'], 1,0)
df['Y2'] = np.where(df['P(IPO)'] < df['P(1Day)'], 1,0)
df["C3_Prime"] = np.where(df['C3'] > 0, 1, 0)
df["C5_Prime"] = df['C5'] / df['C6']
df["C6_Prime"] = np.where(df['P(IPO)'] > df['P(Mid)'], ((df['P(IPO)'] - df['P(Mid)']) / df['P(Mid)']) * 100,0)

## Now we need to change our indusrt columns

In [53]:
df['industry'] = df['I3'].apply(bk.assign_sector)

df['industry'] = df['industry'].replace(['Mining', 'Wholesale Trade', 'Finance/Insurance/Realestate', 'Non Classafiable Establishments',
                                            'Agriculture/Forestry/Fishing', 'Construction', 'NaN'],'Other')
df['industry'].value_counts()

df['industry'] = df['industry'].replace({'Manufacturing': 1, 'Services': 2, 'Transportation/Communications/Utulities': 3, 'Retail Trade': 4,'Other': 5})

df['industry'] = df['industry'].fillna(5)

df['industry_1'] = np.where(df['industry'] ==1, 1, 0)
df['industry_2'] = np.where(df['industry'] ==2, 1, 0)
df['industry_3'] = np.where(df['industry'] ==3, 1, 0)
df['industry_4'] = np.where(df['industry'] ==4, 1, 0)
df['industry_5'] = np.where(df['industry'] ==5, 1, 0)


## Now we need to create our ratio columns


In [54]:
t2_mean = df['T2'].mean()
df['T2'] = np.where(df['T2'] ==0, t2_mean, df['T2'])
df["S1_Prime"]= df['S1']/df['T2']
df["S2_Prime"]= df['S2']/df['T2']
df["S3_Prime"]= df['S3']/df['T2']
df["T3_Prime"]= df['T3']/df['T2']
df["T4_Prime"]= df['T4']/df['T1']
df["T5_Prime"]= df['T5']/df['T2']

## No longer need these columns

In [55]:
df = df.drop(columns = ['I2', 'I3'])

## Getting rid of our 0 value columns

In [56]:
#T3 need to get rid of thier zeros.
t3_mean = df['T3'].mean()
df['T3'] = np.where(df['T3'] ==0, t3_mean, df['T3'])
print(t3_mean)

#S1 need to get rid of thier zeros.
s1_mean = df['S1'].mean()
df['S1'] = np.where(df['S1'] ==-1, s1_mean, df['S1'])
print(s1_mean)

#T5 need to get rid of thier zeros.
t5_mean = df['T5'].mean()
df['T5'] = np.where(df['T5'] ==-1, s1_mean, df['T5'])
print(t5_mean)

11463.39393939394
68.67575757575757
669.8909090909091


## More Imputation

In [57]:
#nromalize qnd impute C7 values
df['C7'] = np.log(df['C7'])
c7_mean = df['C7'].mean()
print(c7_mean)
df['C7'] = df['C7'].fillna(c7_mean)

#normalize and impute C3 values
df['C3'] = np.cbrt(df['C3'])
c3_mean = df['C3'].mean()
print(c3_mean)
df['C3'] = df['C3'].fillna(c3_mean)

4.592588484196865
-0.21843318437028425


## Creyting More Variables for us to use


## Normalize the Rest of the DF

In [58]:
df['market_cap'] =df['P(IPO)'] * df['C6']
df.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime,market_cap
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,14.034621,15.610394,13.635394,25.934766,149.728788,-0.218433,0.007282,49962850.0,12582480.0,4.592588,468.701515,12888.641524,11515.500275,295.957576,669.996478,68.781327,120.89697,145.516667,14.622894,0.5,0.698485,0.477273,4.417255,5.404645,2.048485,0.448485,0.30303,0.087879,0.072727,0.087879,0.005308,0.009055,0.011516,0.891504,0.62672,0.052171,231880400.0
std,5.954214,6.613816,5.790112,73.234948,152.817467,1.534849,0.033318,105537700.0,25399400.0,1.91664,176.37443,5415.681452,4805.168278,122.172959,298.043412,39.31667,85.542267,69.824545,6.187137,0.500379,0.459264,0.499862,5.133537,10.877569,1.272118,0.497716,0.459917,0.283333,0.259885,0.283333,0.001755,0.00299,0.002842,0.071672,0.077686,0.007347,963720100.0
min,5.0,0.0,5.0,0.0,10.0,-9.229642,-0.162352,3693227.0,800000.0,-2.60369,132.0,3271.0,3003.0,0.0,68.675758,9.0,20.0,40.0,4.0,0.0,0.0,0.0,0.283223,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-9.3e-05,0.002152,0.005326,0.0,0.0,-8.9e-05,5775000.0
25%,10.0,13.0,11.0,11.0,85.0,-0.926368,-0.013927,18775680.0,5124546.0,3.724681,356.0,9275.5,8297.0,214.0,463.75,45.0,74.0,100.0,12.0,0.0,0.0,0.0,2.850622,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.004131,0.006966,0.009579,0.883417,0.580329,0.047544,60000000.0
50%,13.75,15.5,13.0,14.845,107.0,-0.218433,0.009125,27454380.0,7500000.0,4.592588,446.0,12141.5,10853.0,280.0,627.0,60.0,101.5,135.0,14.25,0.5,1.0,0.0,3.731035,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.005027,0.008577,0.011301,0.897294,0.628861,0.051927,98700000.0
75%,17.0,17.0,15.0,20.485,155.25,0.738329,0.031571,50039860.0,12000000.0,5.706326,553.25,15275.0,13793.5,355.25,801.25,85.0,144.0,174.0,16.0,1.0,1.0,1.0,4.857904,7.692308,2.0,1.0,1.0,0.0,0.0,0.0,0.006254,0.010521,0.012848,0.909693,0.674461,0.056489,176000000.0
max,85.0,135.0,108.0,1159.200562,2087.0,15.692704,0.092896,2138085000.0,421233600.0,10.331464,1750.0,49056.0,43952.0,1058.0,2729.0,309.0,944.0,883.0,121.5,1.0,1.0,1.0,99.787255,100.0,5.0,1.0,1.0,1.0,1.0,1.0,0.012839,0.024605,0.029397,1.49054,0.826347,0.088329,17864000000.0


In [59]:
pos_skew = ['P(IPO)', 'P(L)', 'C1', 'C5', 'C6', 'T1', 'T2', 'T3', 'T5', 'S1', 'S2', 'S3', 'P(Mid)', 'C5_Prime', 
            'S2_Prime','S3_Prime', 'S1_Prime', 'market_cap' ]

for col in pos_skew:
    df[col]=np.log(df[col])

cbrt_skew = ['P(1Day)', 'T4', 'C6_Prime', 'P(H)']

for col in cbrt_skew:
    df[col]=np.cbrt(df[col])

over_log = ['C5', 'C6']
for col in over_log:
    df[col]=1/np.log(df[col])

df['T4'] = np.sqrt(df['T4'])

neg_skew = ['T3_Prime', 'T4_Prime']

for col in neg_skew:
    df[col]=df[col]**2


  """


In [60]:
cols_test = ['P(IPO)', 'P(H)', 'P(L)', 'C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'P(Mid)',	'C3_Prime',	'C5_Prime',	'C6_Prime',	'industry', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5']
dataset = df[cols_test].copy()
target_y1 = df[['Y1']].copy()
target_y2 = df[['Y2']].copy()

#s1Prime causes us to have an error for some reason

## Y1 Columns

In [61]:
# load the iris datasets
dataset = dataset.values
target_y1 = target_y1.values
# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
for i in range(3, 11):
    print(i)
    # create a base classifier used to evaluate a subset of attributes
    model = LogisticRegression()
    # create the RFE model and select 3 attributes
    rfe = RFE(model, i)
    rfe = rfe.fit(dataset,  target_y1)
    # summarize the selection of the attributes
    print('Model with the best', i, 'features')
    print(rfe.support_)
    print(rfe.ranking_)

3


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model with the best 3 features
[ True False  True False False False False False False False False False
 False False False False False False False False False  True False False
 False False False False False False False False False]
[ 1  3  1 23 27 10 21 29 31 16 30 22 12 17 26  8  4 20  2  9 13  1 24  7
  5  6 18 25 28 14 15 11 19]
4
Model with the best 4 features
[ True False  True False False False False False False False False False
 False False False False False False  True False False  True False False
 False False False False False False False False False]
[ 1  2  1 22 26  9 20 28 30 15 29 21 11 16 25  7  3 19  1  8 12  1 23  6
  4  5 17 24 27 13 14 10 18]
5
Model with the best 5 features
[ True  True  True False False False False False False False False False
 False False False False False False  True False False  True False False
 False False False False False False False False False]
[ 1  1  1 21 25  8 19 27 29 14 28 20 10 15 24  6  2 18  1  7 11  1 22  5
  3  4 16 23 26 12 1

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model with the best 6 features
[ True  True  True False False False False False False False False False
 False False False False  True False  True False False  True False False
 False False False False False False False False False]
[ 1  1  1 20 24  7 18 26 28 13 27 19  9 14 23  5  1 17  1  6 10  1 21  4
  2  3 15 22 25 11 12  8 16]
7
Model with the best 7 features
[ True  True  True False False False False False False False False False
 False False False False  True False  True False False  True False False
  True False False False False False False False False]
[ 1  1  1 19 23  6 17 25 27 12 26 18  8 13 22  4  1 16  1  5  9  1 20  3
  1  2 14 21 24 10 11  7 15]
8


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model with the best 8 features
[ True  True  True False False False False False False False False False
 False False False False  True False  True False False  True False False
  True  True False False False False False False False]
[ 1  1  1 18 22  5 16 24 26 11 25 17  7 12 21  3  1 15  1  4  8  1 19  2
  1  1 13 20 23  9 10  6 14]
9
Model with the best 9 features
[ True  True  True False False False False False False False False False
 False False False False  True False  True False False  True False  True
  True  True False False False False False False False]
[ 1  1  1 17 21  4 15 23 25 10 24 16  6 11 20  2  1 14  1  3  7  1 18  1
  1  1 12 19 22  8  9  5 13]
10


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model with the best 10 features
[ True  True  True False False False False False False False False False
 False False False  True  True False  True False False  True False  True
  True  True False False False False False False False]
[ 1  1  1 16 20  3 14 22 24  9 23 15  5 10 19  1  1 13  1  2  6  1 17  1
  1  1 11 18 21  7  8  4 12]


In [62]:
cols_keep_y1 = ['P(IPO)', 'P(H)', 'P(L)', 'S2', 'P(Mid)', 'C6_Prime', 'Y1', 'Y2']
df_output = df.copy()
df_output = df_output[cols_keep_y1]

## Y2

In [63]:
# load the iris datasets
target_y2 = target_y2.values
# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
for i in range(3, 11):
    print(i)
    # create a base classifier used to evaluate a subset of attributes
    model = LogisticRegression()
    # create the RFE model and select 3 attributes
    rfe = RFE(model, i)
    rfe = rfe.fit(dataset,  target_y2)
    # summarize the selection of the attributes
    print('Model with the best', i, 'features')
    print(rfe.support_)
    print(rfe.ranking_)

3
Model with the best 3 features
[False  True  True False False False False False False False False False
 False False False False False False False False False  True False False
 False False False False False False False False False]
[30  1  1 27  3 15  2 24 25 22  8 18 23  7  9 10 17 16 13 14 11  1 20  5
 31  4 19 29 21 12  6 26 28]
4


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model with the best 4 features
[False  True  True False False False  True False False False False False
 False False False False False False False False False  True False False
 False False False False False False False False False]
[29  1  1 26  2 14  1 23 24 21  7 17 22  6  8  9 16 15 12 13 10  1 19  4
 30  3 18 28 20 11  5 25 27]
5
Model with the best 5 features
[False  True  True False  True False  True False False False False False
 False False False False False False False False False  True False False
 False False False False False False False False False]
[28  1  1 25  1 13  1 22 23 20  6 16 21  5  7  8 15 14 11 12  9  1 18  3
 29  2 17 27 19 10  4 24 26]
6


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model with the best 6 features
[False  True  True False  True False  True False False False False False
 False False False False False False False False False  True False False
 False  True False False False False False False False]
[27  1  1 24  1 12  1 21 22 19  5 15 20  4  6  7 14 13 10 11  8  1 17  2
 28  1 16 26 18  9  3 23 25]
7
Model with the best 7 features
[False  True  True False  True False  True False False False False False
 False False False False False False False False False  True False  True
 False  True False False False False False False False]
[26  1  1 23  1 11  1 20 21 18  4 14 19  3  5  6 13 12  9 10  7  1 16  1
 27  1 15 25 17  8  2 22 24]
8


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model with the best 8 features
[False  True  True False  True False  True False False False False False
 False False False False False False False False False  True False  True
 False  True False False False False  True False False]
[25  1  1 22  1 10  1 19 20 17  3 13 18  2  4  5 12 11  8  9  6  1 15  1
 26  1 14 24 16  7  1 21 23]
9
Model with the best 9 features
[False  True  True False  True False  True False False False False False
 False  True False False False False False False False  True False  True
 False  True False False False False  True False False]
[24  1  1 21  1  9  1 18 19 16  2 12 17  1  3  4 11 10  7  8  5  1 14  1
 25  1 13 23 15  6  1 20 22]
10


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Model with the best 10 features
[False  True  True False  True False  True False False False  True False
 False  True False False False False False False False  True False  True
 False  True False False False False  True False False]
[23  1  1 20  1  8  1 17 18 15  1 11 16  1  2  3 10  9  6  7  4  1 13  1
 24  1 12 22 14  5  1 19 21]


## Write to Datafile

In [64]:
df_output.to_csv('datasets/group2.csv', sep=',')