In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import backend as bk
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from pandas import ExcelWriter

plt.style.use('ggplot')
pd.options.display.max_columns = None

## Import File and Reset Index

In [2]:
df = pd.read_excel('Competition1_raw_data.xlsx', na_values='-')
df.set_index('I1', inplace=True)

## Changing DataTypes

In [3]:
to_float = ['I3','P(IPO)','P(H)', 'P(L)', 'P(1Day)', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'S1', 'S2', 'S3']

for col in to_float:
    df[col]=pd.to_numeric(df[col], errors='coerce')
df['C2'] = df['C2'].astype('bool') 

## Cleanining/Imputing

For our predictor variables only

In [4]:
df = df.dropna(subset=['P(1Day)'])
## Creating Necessary Variables Y1, Y2, C3', C5', and C6'
## Code found on https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column
df['P(Mid)'] = (df['P(H)'] + df['P(L)']) / 2
df['Y1'] = np.where(df['P(IPO)'] < df['P(Mid)'], 1,0)
df['Y2'] = np.where(df['P(IPO)'] < df['P(1Day)'], 1,0)
df["C3_Prime"] = np.where(df['C3'] > 0, 1, 0)
df["C5_Prime"] = df['C5'] / df['C6']
df["C6_Prime"] = np.where(df['P(IPO)'] > df['P(Mid)'], ((df['P(IPO)'] - df['P(Mid)']) / df['P(Mid)']) * 100,0)

## Now we need to change our industry columns

In [5]:
df['industry'] = df['I3'].apply(bk.assign_sector)

df['industry'] = df['industry'].replace(['Mining', 'Wholesale Trade', 'Finance/Insurance/Realestate', 'Non Classafiable Establishments',
                                            'Agriculture/Forestry/Fishing', 'Construction', 'NaN'],'Other')
df['industry'].value_counts()

df['industry'] = df['industry'].replace({'Manufacturing': 1, 'Services': 2, 'Transportation/Communications/Utulities': 3, 'Retail Trade': 4,'Other': 5})

df['industry'] = df['industry'].fillna(5)

df['industry_1'] = np.where(df['industry'] ==1, 1, 0)
df['industry_2'] = np.where(df['industry'] ==2, 1, 0)
df['industry_3'] = np.where(df['industry'] ==3, 1, 0)
df['industry_4'] = np.where(df['industry'] ==4, 1, 0)
df['industry_5'] = np.where(df['industry'] ==5, 1, 0)


## Now we need to create our ratio columns


In [6]:
t2_mean = df['T2'].mean()
df['T2'] = np.where(df['T2'] ==0, t2_mean, df['T2'])
df["S1_Prime"]= df['S1']/df['T2']
df["S2_Prime"]= df['S2']/df['T2']
df["S3_Prime"]= df['S3']/df['T2']
df["T3_Prime"]= df['T3']/df['T2']
df["T4_Prime"]= df['T4']/df['T1']
df["T5_Prime"]= df['T5']/df['T2']

## No longer need these columns

In [7]:
df = df.drop(columns = ['I2', 'I3'])

## Getting rid of our 0 value columns

In [8]:
#T3 need to get rid of thier zeros.
t3_mean = df['T3'].mean()
df['T3'] = np.where(df['T3'] ==0, t3_mean, df['T3'])
print(t3_mean)

#S1 need to get rid of thier zeros.
s1_mean = df['S1'].mean()
df['S1'] = np.where(df['S1'] ==-1, s1_mean, df['S1'])
print(s1_mean)

#T5 need to get rid of thier zeros.
t5_mean = df['T5'].mean()
df['T5'] = np.where(df['T5'] ==-1, s1_mean, df['T5'])
print(t5_mean)

11463.39393939394
68.67575757575757
669.8909090909091


## More Imputation

In [9]:
#nromalize qnd impute C7 values
df['C7'] = np.log(df['C7'])
c7_mean = df['C7'].mean()
print(c7_mean)
df['C7'] = df['C7'].fillna(c7_mean)

#normalize and impute C3 values
df['C3'] = np.cbrt(df['C3'])
c3_mean = df['C3'].mean()
print(c3_mean)
df['C3'] = df['C3'].fillna(c3_mean)

4.592588484196865
-0.21843318437028425


## Creyting More Variables for us to use


## Normalize the Rest of the DF

In [10]:
df['market_cap'] =df['P(1Day)'] * df['C6']
df['market_cap'] = np.where(df['market_cap'] ==0, 0.001, df['market_cap'])
df.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime,market_cap
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,14.034621,15.610394,13.635394,25.934766,149.728788,-0.218433,0.007282,49962850.0,12582480.0,4.592588,468.701515,12888.641524,11515.500275,295.957576,669.996478,68.781327,120.89697,145.516667,14.622894,0.5,0.698485,0.477273,4.417255,5.404645,2.048485,0.448485,0.30303,0.087879,0.072727,0.087879,0.005308,0.009055,0.011516,0.891504,0.62672,0.052171,305089500.0
std,5.954214,6.613816,5.790112,73.234948,152.817467,1.534849,0.033318,105537700.0,25399400.0,1.91664,176.37443,5415.681452,4805.168278,122.172959,298.043412,39.31667,85.542267,69.824545,6.187137,0.500379,0.459264,0.499862,5.133537,10.877569,1.272118,0.497716,0.459917,0.283333,0.259885,0.283333,0.001755,0.00299,0.002842,0.071672,0.077686,0.007347,919043000.0
min,5.0,0.0,5.0,0.0,10.0,-9.229642,-0.162352,3693227.0,800000.0,-2.60369,132.0,3271.0,3003.0,0.0,68.675758,9.0,20.0,40.0,4.0,0.0,0.0,0.0,0.283223,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-9.3e-05,0.002152,0.005326,0.0,0.0,-8.9e-05,0.001
25%,10.0,13.0,11.0,11.0,85.0,-0.926368,-0.013927,18775680.0,5124546.0,3.724681,356.0,9275.5,8297.0,214.0,463.75,45.0,74.0,100.0,12.0,0.0,0.0,0.0,2.850622,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.004131,0.006966,0.009579,0.883417,0.580329,0.047544,65662560.0
50%,13.75,15.5,13.0,14.845,107.0,-0.218433,0.009125,27454380.0,7500000.0,4.592588,446.0,12141.5,10853.0,280.0,627.0,60.0,101.5,135.0,14.25,0.5,1.0,0.0,3.731035,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.005027,0.008577,0.011301,0.897294,0.628861,0.051927,122958800.0
75%,17.0,17.0,15.0,20.485,155.25,0.738329,0.031571,50039860.0,12000000.0,5.706326,553.25,15275.0,13793.5,355.25,801.25,85.0,144.0,174.0,16.0,1.0,1.0,1.0,4.857904,7.692308,2.0,1.0,1.0,0.0,0.0,0.0,0.006254,0.010521,0.012848,0.909693,0.674461,0.056489,240187500.0
max,85.0,135.0,108.0,1159.200562,2087.0,15.692704,0.092896,2138085000.0,421233600.0,10.331464,1750.0,49056.0,43952.0,1058.0,2729.0,309.0,944.0,883.0,121.5,1.0,1.0,1.0,99.787255,100.0,5.0,1.0,1.0,1.0,1.0,1.0,0.012839,0.024605,0.029397,1.49054,0.826347,0.088329,16104520000.0


In [11]:
pos_skew = ['P(IPO)', 'P(L)', 'C1', 'C5', 'C6', 'T1', 'T2', 'T3', 'T5', 'S1', 'S2', 'S3', 'P(Mid)', 'C5_Prime', 
            'S2_Prime','S3_Prime', 'S1_Prime', 'market_cap' ]

for col in pos_skew:
    df[col]=np.log(df[col])

cbrt_skew = ['P(1Day)', 'T4', 'C6_Prime', 'P(H)']

for col in cbrt_skew:
    df[col]=np.cbrt(df[col])

over_log = ['C5', 'C6']
for col in over_log:
    df[col]=1/np.log(df[col])

df['T4'] = np.sqrt(df['T4'])

neg_skew = ['T3_Prime', 'T4_Prime']

for col in neg_skew:
    df[col]=df[col]**2
    

df['industry_1'] = df['industry_1'].astype('category')
df['industry_2'] = df['industry_2'].astype('category')
df.describe()

  """


Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime,market_cap
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,659.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,2.570174,2.468196,2.556056,2.595374,4.757714,-0.218433,0.007282,0.35126,0.361369,4.592588,6.085846,9.384331,9.272666,2.547272,6.420257,4.095896,4.629703,4.892455,2.629442,0.5,0.698485,0.477273,1.321676,0.872089,2.048485,0.087879,0.072727,0.087879,-5.28879,-4.754628,-4.491876,0.799909,0.398804,0.052171,18.56766
std,0.374586,0.276076,0.324272,0.869526,0.670366,1.534849,0.033318,0.005743,0.006058,1.91664,0.359248,0.401475,0.398817,0.219957,0.418741,0.522101,0.555689,0.410897,0.312893,0.500379,0.459264,0.499862,0.508141,1.158446,1.272118,0.283333,0.259885,0.283333,0.324152,0.315494,0.233634,0.099443,0.090072,0.007347,2.275603
min,1.609438,0.0,1.609438,0.0,2.302585,-9.229642,-0.162352,0.326023,0.3346,-2.60369,4.882802,8.092851,8.007367,0.0,4.229396,2.197225,2.995732,3.688879,1.386294,0.0,0.0,0.0,-1.261521,0.0,1.0,0.0,0.0,0.0,-6.436265,-6.141157,-5.235159,0.0,0.0,-8.9e-05,-6.907755
25%,2.302585,2.351335,2.397895,2.22398,4.442651,-0.926368,-0.013927,0.347806,0.35827,3.724681,5.874931,9.135132,9.023647,2.445695,6.139345,3.806662,4.304065,4.60517,2.484907,0.0,0.0,0.0,1.047537,0.0,1.0,0.0,0.0,0.0,-5.48849,-4.966767,-4.648159,0.780425,0.336782,0.047544,18.000029
50%,2.620874,2.493315,2.564949,2.457688,4.672829,-0.218433,0.009125,0.352024,0.362065,4.592588,6.100319,9.404384,9.292197,2.557759,6.440947,4.094345,4.620047,4.905275,2.656603,0.5,1.0,0.0,1.316685,0.0,2.0,0.0,0.0,0.0,-5.291951,-4.758642,-4.482895,0.805136,0.395466,0.051927,18.627359
75%,2.833213,2.571282,2.70805,2.736184,5.045033,0.738329,0.031571,0.354826,0.365286,5.706326,6.31581,9.633973,9.531953,2.66127,6.686169,4.442651,4.969813,5.159055,2.772589,1.0,1.0,1.0,1.580607,1.974023,2.0,0.0,0.0,0.0,-5.074419,-4.554401,-4.354551,0.827541,0.454897,0.056489,19.29693
max,4.442651,5.129928,4.682131,10.504761,7.643483,15.692704,0.092896,0.368168,0.383214,10.331464,7.467371,10.800718,10.690853,3.192133,7.911691,5.733341,6.850126,6.783325,4.799914,1.0,1.0,1.0,4.60304,4.641589,5.0,1.0,1.0,1.0,-4.355265,-3.704811,-3.526857,2.22171,0.68285,0.088329,23.502366


## Standradize the data

In [12]:
cols = ['C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'C3_Prime',	'C5_Prime',	'C6_Prime', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime', 'market_cap']

for col in cols:
    mean = df[col].mean()
    std_dev = df[col].std()
    df[col] = (df[col]-mean)/std_dev

df.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime,market_cap
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,659.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,2.570174,2.468196,2.556056,2.595374,8.31658e-16,-4.84461e-17,3.22974e-17,1.480297e-17,2.384624e-15,7.320743e-16,-8.074349000000001e-17,2.451238e-15,-4.272004e-15,7.186171e-16,1.481643e-15,1.507212e-15,-7.589888e-16,4.669665e-16,-5.113755000000001e-17,2.629442,0.5,0.698485,-5.652044e-17,3.122082e-16,-3.7680300000000004e-17,2.048485,0.087879,0.072727,0.087879,-5.28879,7.024684e-16,-2.099331e-16,2.328104e-16,-2.072416e-16,-2.677993e-16,6.190334e-16
std,0.374586,0.276076,0.324272,0.869526,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.312893,0.500379,0.459264,1.0,1.0,1.0,1.272118,0.283333,0.259885,0.283333,0.324152,1.0,1.0,1.0,1.0,1.0,1.0
min,1.609438,0.0,1.609438,0.0,-3.662373,-2.467293,-5.871073,-5.091421,-4.394496,-4.418565,-3.754632,-3.348785,-3.216837,-3.172633,-11.58075,-5.232019,-3.636599,-2.940438,-2.929139,1.386294,0.0,0.0,-0.9548089,-5.083623,-0.7528092,1.0,0.0,0.0,0.0,-6.436265,-4.394784,-3.181396,-8.04393,-4.427596,-7.112805,-11.19502
25%,2.302585,2.351335,2.397895,2.22398,-0.469987,0.4046883,-0.4612405,-0.6365764,-0.601329,-0.5115817,-0.4528277,-0.5871021,-0.6207088,-0.6243953,-0.4618029,-0.670848,-0.5539804,-0.5860067,-0.6991645,2.484907,0.0,0.0,-0.9548089,-0.5394931,-0.7528092,1.0,0.0,0.0,0.0,-5.48849,-0.6724027,-0.6689232,-0.1959246,-0.688579,-0.6297333,-0.249442
50%,2.620874,2.493315,2.564949,2.457688,-0.1266258,0.4046883,1.808359e-17,0.05529772,0.1330839,0.1148624,0.0,0.04028699,0.04994918,0.04897075,0.04767894,0.04940965,-0.002971866,-0.01737687,0.03119944,2.656603,0.5,1.0,-0.9548089,-0.009820562,-0.7528092,2.0,0.0,0.0,0.0,-5.291951,-0.0127244,0.03844075,0.05256843,-0.0370604,-0.03320758,0.0262346
75%,2.833213,2.571282,2.70805,2.736184,0.4285995,0.4046883,0.623359,0.7289992,0.6210143,0.6465071,0.5810884,0.6401255,0.6218117,0.650139,0.5182741,0.6350291,0.6641533,0.6120514,0.6488245,2.772589,1.0,1.0,1.045743,0.5095662,0.9512181,2.0,0.0,0.0,0.0,-5.074419,0.6346455,0.5877769,0.2778732,0.6227563,0.5877876,0.3204732
max,4.442651,5.129928,4.682131,10.504761,4.304768,0.4046883,10.36658,2.569618,2.944274,3.605687,2.994238,3.845604,3.527957,3.555986,2.931753,3.56171,3.136262,3.995799,4.601807,4.799914,1.0,1.0,1.045743,6.45759,3.253929,5.0,1.0,1.0,1.0,-4.355265,3.327532,4.130466,14.29772,3.153526,4.921293,2.168526


In [13]:
cols_test_y1 = ['C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'C3_Prime',	'C5_Prime', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5', 'market_cap']

cols_test_y2 = ['C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'C3_Prime',	'C5_Prime',	'C6_Prime', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5']

dataset_y1 = df[cols_test_y1].copy()
dataset_y2 = df[cols_test_y2].copy()
target_y1 = df[['Y1']].copy()
target_y2 = df[['Y2']].copy()

#s1Prime causes us to have an error for some reason

## Y1 Columns

In [14]:
dataset_y1 = dataset_y1.values
target_y1 = target_y1.values

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 6)
rfe = rfe.fit(dataset_y1, target_y1)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[False False  True False False False False False False  True False  True
 False False False False False  True False False False False False False
 False  True False  True]
[15 10  1  9 11 12 23 22  6  1  4  1  7 18 14 13  8  1 19 17  5  2 16 20
  3  1 21  1]


  y = column_or_1d(y, warn=True)


## Y2

In [15]:
dataset_y2 = dataset_y2 = dataset_y2.values
target_y2 = target_y2.values

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 6)
rfe = rfe.fit(dataset_y2, target_y2)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[False False False False  True  True False False False False False False
 False False False False  True  True False False False False False  True
  True False False False]
[23 15  8 10  1  1 21  4 16  6  5  3 14  2 19  9  1  1 20 18 17 12  7  1
  1 11 22 13]


  y = column_or_1d(y, warn=True)


## Collumns we want to keep

In [17]:
df_output = df.copy()
cols_keep = ['C3', 'T3', 'T5', 'S2_Prime','industry_4', 'market_cap', 'C5','C6', 'C5_Prime','C6_Prime', 'industry_1', 'industry_2', 'Y1', 'Y2']
df_output = df.copy()
df_output = df_output[cols_keep]
df_output.head()

KeyError: "['industry_4market_cap'] not in index"

## Write to Datafile

In [None]:
df_output.to_csv('datasets/group10.csv', sep=',')