In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import backend as bk
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from pandas import ExcelWriter

plt.style.use('ggplot')
pd.options.display.max_columns = None

## Import File and Reset Index

In [22]:
df = pd.read_excel('Competition1_raw_data.xlsx', na_values='-')
df.set_index('I1', inplace=True)

## Changing DataTypes

In [23]:
to_float = ['I3','P(IPO)','P(H)', 'P(L)', 'P(1Day)', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'S1', 'S2', 'S3']

for col in to_float:
    df[col]=pd.to_numeric(df[col], errors='coerce')
df['C2'] = df['C2'].astype('bool') 

## Cleanining/Imputing

For our predictor variables only

In [24]:
df = df.dropna(subset=['P(1Day)'])
## Creating Necessary Variables Y1, Y2, C3', C5', and C6'
## Code found on https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column
df['P(Mid)'] = (df['P(H)'] + df['P(L)']) / 2
df['Y1'] = np.where(df['P(IPO)'] < df['P(Mid)'], 1,0)
df['Y2'] = np.where(df['P(IPO)'] < df['P(1Day)'], 1,0)
df["C3_Prime"] = np.where(df['C3'] > 0, 1, 0)
df["C5_Prime"] = df['C5'] / df['C6']
df["C6_Prime"] = np.where(df['P(IPO)'] > df['P(Mid)'], ((df['P(IPO)'] - df['P(Mid)']) / df['P(Mid)']) * 100,0)

## Now we need to change our indusrt columns

In [25]:
df['industry'] = df['I3'].apply(bk.assign_sector)

df['industry'] = df['industry'].replace(['Mining', 'Wholesale Trade', 'Finance/Insurance/Realestate', 'Non Classafiable Establishments',
                                            'Agriculture/Forestry/Fishing', 'Construction', 'NaN'],'Other')
df['industry'].value_counts()

df['industry'] = df['industry'].replace({'Manufacturing': 1, 'Services': 2, 'Transportation/Communications/Utulities': 3, 'Retail Trade': 4,'Other': 5})

df['industry'] = df['industry'].fillna(5)

df['industry_1'] = np.where(df['industry'] ==1, 1, 0)
df['industry_2'] = np.where(df['industry'] ==2, 1, 0)
df['industry_3'] = np.where(df['industry'] ==3, 1, 0)
df['industry_4'] = np.where(df['industry'] ==4, 1, 0)
df['industry_5'] = np.where(df['industry'] ==5, 1, 0)


## Now we need to create our ratio columns


In [26]:
t2_mean = df['T2'].mean()
df['T2'] = np.where(df['T2'] ==0, t2_mean, df['T2'])
df["S1_Prime"]= df['S1']/df['T2']
df["S2_Prime"]= df['S2']/df['T2']
df["S3_Prime"]= df['S3']/df['T2']
df["T3_Prime"]= df['T3']/df['T2']
df["T4_Prime"]= df['T4']/df['T1']
df["T5_Prime"]= df['T5']/df['T2']

## No longer need these columns

In [27]:
df = df.drop(columns = ['I2', 'I3'])

## Getting rid of our 0 value columns

In [28]:
#T3 need to get rid of thier zeros.
t3_mean = df['T3'].mean()
df['T3'] = np.where(df['T3'] ==0, t3_mean, df['T3'])
print(t3_mean)

#S1 need to get rid of thier zeros.
s1_mean = df['S1'].mean()
df['S1'] = np.where(df['S1'] ==-1, s1_mean, df['S1'])
print(s1_mean)

#T5 need to get rid of thier zeros.
t5_mean = df['T5'].mean()
df['T5'] = np.where(df['T5'] ==-1, s1_mean, df['T5'])
print(t5_mean)

11463.39393939394
68.67575757575757
669.8909090909091


## More Imputation

In [29]:
#nromalize qnd impute C7 values
df['C7'] = np.log(df['C7'])
c7_mean = df['C7'].mean()
print(c7_mean)
df['C7'] = df['C7'].fillna(c7_mean)

#normalize and impute C3 values
df['C3'] = np.cbrt(df['C3'])
c3_mean = df['C3'].mean()
print(c3_mean)
df['C3'] = df['C3'].fillna(c3_mean)

4.592588484196865
-0.21843318437028425


## Creyting More Variables for us to use


## Normalize the Rest of the DF

In [30]:
df['market_cap'] =df['P(IPO)'] * df['C6']
df.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime,market_cap
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,14.034621,15.610394,13.635394,25.934766,149.728788,-0.218433,0.007282,49962850.0,12582480.0,4.592588,468.701515,12888.641524,11515.500275,295.957576,669.996478,68.781327,120.89697,145.516667,14.622894,0.5,0.698485,0.477273,4.417255,5.404645,2.048485,0.448485,0.30303,0.087879,0.072727,0.087879,0.005308,0.009055,0.011516,0.891504,0.62672,0.052171,231880400.0
std,5.954214,6.613816,5.790112,73.234948,152.817467,1.534849,0.033318,105537700.0,25399400.0,1.91664,176.37443,5415.681452,4805.168278,122.172959,298.043412,39.31667,85.542267,69.824545,6.187137,0.500379,0.459264,0.499862,5.133537,10.877569,1.272118,0.497716,0.459917,0.283333,0.259885,0.283333,0.001755,0.00299,0.002842,0.071672,0.077686,0.007347,963720100.0
min,5.0,0.0,5.0,0.0,10.0,-9.229642,-0.162352,3693227.0,800000.0,-2.60369,132.0,3271.0,3003.0,0.0,68.675758,9.0,20.0,40.0,4.0,0.0,0.0,0.0,0.283223,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-9.3e-05,0.002152,0.005326,0.0,0.0,-8.9e-05,5775000.0
25%,10.0,13.0,11.0,11.0,85.0,-0.926368,-0.013927,18775680.0,5124546.0,3.724681,356.0,9275.5,8297.0,214.0,463.75,45.0,74.0,100.0,12.0,0.0,0.0,0.0,2.850622,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.004131,0.006966,0.009579,0.883417,0.580329,0.047544,60000000.0
50%,13.75,15.5,13.0,14.845,107.0,-0.218433,0.009125,27454380.0,7500000.0,4.592588,446.0,12141.5,10853.0,280.0,627.0,60.0,101.5,135.0,14.25,0.5,1.0,0.0,3.731035,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.005027,0.008577,0.011301,0.897294,0.628861,0.051927,98700000.0
75%,17.0,17.0,15.0,20.485,155.25,0.738329,0.031571,50039860.0,12000000.0,5.706326,553.25,15275.0,13793.5,355.25,801.25,85.0,144.0,174.0,16.0,1.0,1.0,1.0,4.857904,7.692308,2.0,1.0,1.0,0.0,0.0,0.0,0.006254,0.010521,0.012848,0.909693,0.674461,0.056489,176000000.0
max,85.0,135.0,108.0,1159.200562,2087.0,15.692704,0.092896,2138085000.0,421233600.0,10.331464,1750.0,49056.0,43952.0,1058.0,2729.0,309.0,944.0,883.0,121.5,1.0,1.0,1.0,99.787255,100.0,5.0,1.0,1.0,1.0,1.0,1.0,0.012839,0.024605,0.029397,1.49054,0.826347,0.088329,17864000000.0


In [31]:
pos_skew = ['P(IPO)', 'P(L)', 'C1', 'C5', 'C6', 'T1', 'T2', 'T3', 'T5', 'S1', 'S2', 'S3', 'P(Mid)', 'C5_Prime', 
            'S2_Prime','S3_Prime', 'S1_Prime', 'market_cap' ]

for col in pos_skew:
    df[col]=np.log(df[col])

cbrt_skew = ['P(1Day)', 'T4', 'C6_Prime', 'P(H)']

for col in cbrt_skew:
    df[col]=np.cbrt(df[col])

over_log = ['C5', 'C6']
for col in over_log:
    df[col]=1/np.log(df[col])

df['T4'] = np.sqrt(df['T4'])

neg_skew = ['T3_Prime', 'T4_Prime']

for col in neg_skew:
    df[col]=df[col]**2
df.head(1)

  """


Unnamed: 0_level_0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime,market_cap
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
AATI,2.302585,2.117912,2.140066,2.281131,4.804021,True,1.508104,0.029074,0.349185,0.359253,3.938568,6.152733,9.450852,9.355306,2.588776,6.536692,4.127134,4.762174,4.934474,2.197225,0,1,1,1.351792,2.231443,1.0,1,0,0,0,0,-5.323718,-4.688678,-4.516378,0.826056,0.410145,0.05425,18.47895


In [32]:
cols_test = ['P(IPO)', 'P(H)', 'P(L)', 'P(1Day)','C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'P(Mid)',	'C3_Prime',	'C5_Prime',	'C6_Prime', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5', 'market_cap']
dataset = df[cols_test].copy()
target_y1 = df[['Y1']].copy()
target_y2 = df[['Y2']].copy()

#s1Prime causes us to have an error for some reason

## Y1 Columns

In [33]:
dataset = dataset.values
target_y1 = target_y1.values

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 3)
rfe = rfe.fit(dataset, target_y1)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[ True False  True False False False False False False False False False
 False False False False False False False False False False  True False
 False False False False False False False False False False]
[ 1  3  1 28 24 23 10 22 32 31 15 27 20 21 17 29  8  4 19  2  9 18  1  7
  5  6 16 30 13 25 26 11 12 14]


  y = column_or_1d(y, warn=True)


In [34]:
cols_keep_y1 = ['P(IPO)', 'P(H)', 'P(L)', 'S2', 'P(Mid)', 'C6_Prime', 'Y1', 'Y2']
df_output = df.copy()
df_output = df_output[cols_keep_y1]

## Y2

In [35]:
target_y2 = target_y2.values

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 3)
rfe = rfe.fit(dataset, target_y2)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

  y = column_or_1d(y, warn=True)


[ True False  True  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False]
[ 1 13  1  1 26 12 24  4 29 28 21 16 25  8  7  9 15 10 23  3 11 22 14 20
 27  2 32 30 18 17  5 31  6 19]


## Collumns we want to keep

In [36]:
df_output = df.copy()
cols_keep = ['P(IPO)', 'P(L)', 'C6_Prime', 'P(1Day)', 'Y1', 'Y2']
df_output = df.copy()
df_output = df_output[cols_keep]
df_output.head()

Unnamed: 0_level_0,P(IPO),P(L),C6_Prime,P(1Day),Y1,Y2
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AATI,2.302585,2.140066,2.231443,2.281131,0,1
ABPI,2.079442,2.079442,0.0,1.935438,1,0
ACAD,1.94591,2.484907,0.0,1.885204,1,0
ACHN,2.442347,2.639057,0.0,2.313967,1,1
ACLI,3.044522,2.944439,1.709976,3.839478,0,1


## Write to Datafile

In [37]:
df_output.to_csv('datasets/group2.csv', sep=',')