In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import backend as bk
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from pandas import ExcelWriter
from sklearn import preprocessing

plt.style.use('ggplot')
pd.options.display.max_columns = None

## Import File and Reset Index

In [2]:
df = pd.read_excel('Competition1_raw_data.xlsx', na_values='-')
df.set_index('I1', inplace=True)
df.head(1)

Unnamed: 0_level_0,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AATI,ADVANCED ANALOGIC TECHNOLOGIES INC,3674,10.0,9.5,8.5,11.87,122.0,1.0,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0


## Changing DataTypes

In [3]:
to_float = ['I3','P(IPO)','P(H)', 'P(L)', 'P(1Day)', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'S1', 'S2', 'S3']

for col in to_float:
    df[col]=pd.to_numeric(df[col], errors='coerce')
df.dtypes

I2          object
I3         float64
P(IPO)     float64
P(H)       float64
P(L)       float64
P(1Day)    float64
C1         float64
C2         float64
C3         float64
C4         float64
C5         float64
C6         float64
C7         float64
T1         float64
T2         float64
T3         float64
T4         float64
T5         float64
S1         float64
S2         float64
S3         float64
dtype: object

## Cleanining/Imputing

For our predictor variables only

In [4]:
df = df.dropna(subset=['P(1Day)'])
## Creating Necessary Variables Y1, Y2, C3', C5', and C6'
## Code found on https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column
df['P(Mid)'] = (df['P(H)'] + df['P(L)']) / 2
df['Y1'] = np.where(df['P(IPO)'] < df['P(Mid)'], 1,0)
df['Y2'] = np.where(df['P(IPO)'] < df['P(1Day)'], 1,0)
df["C3_Prime"] = np.where(df['C3'] > 0, 1, 0)
df["C5_Prime"] = df['C5'] / df['C6']
df["C6_Prime"] = np.where(df['P(IPO)'] > df['P(Mid)'], ((df['P(IPO)'] - df['P(Mid)']) / df['P(Mid)']) * 100,0)

In [5]:
df_y1 = df['Y1']
df_y2= df['Y2']

## Now we need to change our industry columns

In [6]:
df['industry'] = df['I3'].apply(bk.assign_sector)

df['industry'] = df['industry'].replace(['Mining', 'Wholesale Trade', 'Finance/Insurance/Realestate', 'Non Classafiable Establishments',
                                            'Agriculture/Forestry/Fishing', 'Construction', 'NaN'],'Other')
df['industry'].value_counts()

df['industry'] = df['industry'].replace({'Manufacturing': 1, 'Services': 2, 'Transportation/Communications/Utulities': 3, 'Retail Trade': 4,'Other': 5})

df['industry'] = df['industry'].fillna(5)

df['industry_1'] = np.where(df['industry'] ==1, 1, 0)
df['industry_2'] = np.where(df['industry'] ==2, 1, 0)
df['industry_3'] = np.where(df['industry'] ==3, 1, 0)
df['industry_4'] = np.where(df['industry'] ==4, 1, 0)
df['industry_5'] = np.where(df['industry'] ==5, 1, 0)


## Now we need to create our ratio columns


In [7]:
t2_mean = df['T2'].mean()
df['T2'] = np.where(df['T2'] ==0, t2_mean, df['T2'])
df["S1_Prime"]= df['S1']/df['T2']
df["S2_Prime"]= df['S2']/df['T2']
df["S3_Prime"]= df['S3']/df['T2']
df["T3_Prime"]= df['T3']/df['T2']
df["T4_Prime"]= df['T4']/df['T1']
df["T5_Prime"]= df['T5']/df['T2']

## No longer need these columns

In [8]:
df = df.drop(columns = ['I2', 'I3', ])
df.head()

Unnamed: 0_level_0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
AATI,10.0,9.5,8.5,11.87,122.0,1.0,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,9.0,0,1,1,3.864345,11.111111,1.0,1,0,0,0,0,0.004875,0.009199,0.010929,0.908876,0.640426,0.05425
ABPI,8.0,10.0,8.0,7.25,259.0,0.0,-1.62,-0.013352,28869196.0,2400000.0,25.936,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0,9.0,1,0,0,12.028832,0.0,1.0,1,0,0,0,0,0.003258,0.011105,0.010876,0.898724,0.644753,0.051395
ACAD,7.0,14.0,12.0,6.7,90.0,1.0,-1.24,0.020715,16845668.0,5000000.0,7.378,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0,13.0,1,0,0,3.369134,0.0,1.0,1,0,0,0,0,0.011593,0.006271,0.011403,0.90935,0.636816,0.061764
ACHN,11.5,16.0,14.0,12.39,209.0,1.0,-0.91,0.020023,14848637.0,4500000.0,8.526,328.0,8259.0,7574.0,177.0,509.0,80.0,59.0,110.0,15.0,1,1,0,3.299697,0.0,1.0,1,0,0,0,0,0.009686,0.007144,0.013319,0.91706,0.539634,0.06163
ACLI,21.0,21.0,19.0,56.599998,80.0,1.0,0.07,-0.034895,30741716.0,8250000.0,632.298,572.0,14830.0,13176.0,336.0,720.0,67.0,149.0,167.0,20.0,0,1,1,3.726269,5.0,3.0,0,0,1,0,0,0.004518,0.010047,0.011261,0.888469,0.587413,0.04855


## Getting rid of our 0 value columns

In [9]:
#T3 need to get rid of thier zeros.
t3_mean = df['T3'].mean()
df['T3'] = np.where(df['T3'] ==0, t3_mean, df['T3'])
print(t3_mean)

#S1 need to get rid of thier zeros.
s1_mean = df['S1'].mean()
df['S1'] = np.where(df['S1'] ==-1, s1_mean, df['S1'])
print(s1_mean)

#T5 need to get rid of thier zeros.
t5_mean = df['T5'].mean()
df['T5'] = np.where(df['T5'] ==-1, s1_mean, df['T5'])
print(t5_mean)

11463.39393939394
68.67575757575757
669.8909090909091


## More Imputation

In [10]:
#nromalize qnd impute C7 values
df['C7'] = np.log(df['C7'])
c7_mean = df['C7'].mean()
print(c7_mean)
df['C7'] = df['C7'].fillna(c7_mean)

#normalize and impute C3 values
df['C3'] = np.cbrt(df['C3'])
c3_mean = df['C3'].mean()
print(c3_mean)
df['C3'] = df['C3'].fillna(c3_mean)

4.592588484196865
-0.21843318437028425


## Fix our Data for Normalization

In [11]:
#try l1 vs l2
df['C6']=1/np.log(df['C6'])
df_norm_l1 = pd.DataFrame(preprocessing.normalize(df,norm='l1'),columns = df.columns)
df_norm_l1.skew()

P(IPO)         0.000000
P(H)           0.000000
P(L)           0.000000
P(1Day)       14.838864
C1             7.037567
C2             0.000000
C3             0.000000
C4             0.000000
C5            -1.779035
C6             0.000000
C7             0.000000
T1             1.736245
T2             1.799528
T3             1.776134
T4             1.649219
T5             1.732116
S1             0.000000
S2             3.164731
S3             2.172735
P(Mid)         0.000000
Y1             0.000000
Y2             0.000000
C3_Prime       0.000000
C5_Prime       0.000000
C6_Prime       0.000000
industry       0.000000
industry_1     0.000000
industry_2     0.000000
industry_3     0.000000
industry_4     0.000000
industry_5     0.000000
S1_Prime       0.000000
S2_Prime       0.000000
S3_Prime       0.000000
T3_Prime       0.000000
T4_Prime       0.000000
T5_Prime       0.000000
dtype: float64

In [12]:
pos_skew = ['C1']

for col in pos_skew:
    df_norm_l1[col]=np.log(df_norm_l1[col])
    

#df_norm_l1['C6']=1/np.log(df_norm_l1['C6'])##

over_log = ['T1', 'T2', 'T3', 'T5', 'S2', 'S3']
for col in over_log:
    df_norm_l1[col]=1/np.log(df_norm_l1[col])

df_norm_l1['T4'] = np.sqrt(df_norm_l1['T4'])


df['industry_1'] = df['industry_1'].astype('category')
df['industry_2'] = df['industry_2'].astype('category')
df['industry_3'] = df['industry_3'].astype('category')
df['industry_4'] = df['industry_4'].astype('category')
df['industry_5'] = df['industry_5'].astype('category')
df.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,14.034621,15.610394,13.635394,25.934766,149.728788,0.859091,-0.218433,0.007282,49962850.0,0.062853,4.592588,468.701515,12888.641524,11515.500275,295.957576,669.996478,68.781327,120.89697,145.516667,14.622894,0.5,0.698485,0.477273,4.417255,5.404645,2.048485,0.005308,0.009055,0.011516,0.891504,0.62672,0.052171
std,5.954214,6.613816,5.790112,73.234948,152.817467,0.348192,1.534849,0.033318,105537700.0,0.002905,1.91664,176.37443,5415.681452,4805.168278,122.172959,298.043412,39.31667,85.542267,69.824545,6.187137,0.500379,0.459264,0.499862,5.133537,10.877569,1.272118,0.001755,0.00299,0.002842,0.071672,0.077686,0.007347
min,5.0,0.0,5.0,0.0,10.0,0.0,-9.229642,-0.162352,3693227.0,0.050356,-2.60369,132.0,3271.0,3003.0,0.0,68.675758,9.0,20.0,40.0,4.0,0.0,0.0,0.0,0.283223,0.0,1.0,-9.3e-05,0.002152,0.005326,0.0,0.0,-8.9e-05
25%,10.0,13.0,11.0,11.0,85.0,1.0,-0.926368,-0.013927,18775680.0,0.061348,3.724681,356.0,9275.5,8297.0,214.0,463.75,45.0,74.0,100.0,12.0,0.0,0.0,0.0,2.850622,0.0,1.0,0.004131,0.006966,0.009579,0.883417,0.580329,0.047544
50%,13.75,15.5,13.0,14.845,107.0,1.0,-0.218433,0.009125,27454380.0,0.06317,4.592588,446.0,12141.5,10853.0,280.0,627.0,60.0,101.5,135.0,14.25,0.5,1.0,0.0,3.731035,0.0,2.0,0.005027,0.008577,0.011301,0.897294,0.628861,0.051927
75%,17.0,17.0,15.0,20.485,155.25,1.0,0.738329,0.031571,50039860.0,0.064727,5.706326,553.25,15275.0,13793.5,355.25,801.25,85.0,144.0,174.0,16.0,1.0,1.0,1.0,4.857904,7.692308,2.0,0.006254,0.010521,0.012848,0.909693,0.674461,0.056489
max,85.0,135.0,108.0,1159.200562,2087.0,1.0,15.692704,0.092896,2138085000.0,0.073571,10.331464,1750.0,49056.0,43952.0,1058.0,2729.0,309.0,944.0,883.0,121.5,1.0,1.0,1.0,99.787255,100.0,5.0,0.012839,0.024605,0.029397,1.49054,0.826347,0.088329


In [13]:
df_norm_l1.skew()

P(IPO)         0.000000
P(H)           0.000000
P(L)           0.000000
P(1Day)       14.838864
C1            -0.408258
C2             0.000000
C3             0.000000
C4             0.000000
C5            -1.779035
C6             0.000000
C7             0.000000
T1             0.229093
T2            -0.015612
T3             0.003224
T4             0.505704
T5             0.233783
S1             0.000000
S2             0.053896
S3             0.243292
P(Mid)         0.000000
Y1             0.000000
Y2             0.000000
C3_Prime       0.000000
C5_Prime       0.000000
C6_Prime       0.000000
industry       0.000000
industry_1     0.000000
industry_2     0.000000
industry_3     0.000000
industry_4     0.000000
industry_5     0.000000
S1_Prime       0.000000
S2_Prime       0.000000
S3_Prime       0.000000
T3_Prime       0.000000
T4_Prime       0.000000
T5_Prime       0.000000
dtype: float64

In [14]:
data_scaled = pd.DataFrame(preprocessing.scale(df_norm_l1),columns = df.columns)
data_scaled.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,7.536059000000001e-17,6.459479e-17,-8.074349000000001e-17,2.15316e-17,-7.051598e-16,9.554647000000001e-17,5.3829e-18,-9.420074e-18,-9.446989e-15,9.150929000000001e-17,4.84461e-17,-1.426468e-16,-5.3829000000000005e-17,2.637621e-16,3.22974e-16,-1.294587e-15,-1.507212e-16,-1.372639e-16,-7.105427e-16,-1.399554e-16,-7.805204000000001e-17,2.422305e-17,1.61487e-17,5.921189000000001e-17,1.480297e-17,2.6914500000000003e-17,-6.997769e-17,5.921189000000001e-17,1.07658e-17,-9.150929000000001e-17,1.2111520000000002e-17,-3.22974e-17,-6.863197000000001e-17,8.074349000000001e-17,1.61487e-17,-2.879851e-16,-1.103494e-16
std,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758,1.000758
min,-1.335152,-1.437321,-1.407594,-0.2019819,-4.15914,-1.255989,-9.082956,-5.47905,-5.72328,-1.177328,-3.367192,-2.804945,-3.246969,-3.277177,-2.944551,-2.990323,-1.287356,-4.116852,-3.605643,-1.402493,-0.6844895,-0.8914115,-0.5929071,-1.248749,-0.3806949,-0.7971545,-0.6618877,-0.4561213,-0.2442388,-0.2106973,-0.2194878,-1.17281,-1.269664,-1.210812,-1.249124,-1.263922,-1.316299
25%,-0.6917326,-0.6843607,-0.6851459,-0.1559882,-0.5855629,-0.7639116,-0.2819602,-0.4259813,-0.429486,-0.6508171,-0.6392694,-0.6851521,-0.6348955,-0.6408622,-0.6857275,-0.6927364,-0.6669539,-0.6608163,-0.6452343,-0.6919782,-0.6844895,-0.8914115,-0.5929071,-0.5757421,-0.3806949,-0.5210174,-0.6618877,-0.4561213,-0.2442388,-0.2106973,-0.2194878,-0.6403091,-0.6897713,-0.7235204,-0.6711042,-0.6650453,-0.7169546
50%,-0.2003228,-0.1805816,-0.1914758,-0.1220686,0.05712208,-0.08562474,0.1212091,-0.08361178,0.2020367,-0.1806482,-0.2047311,-0.04002388,-0.01515673,-0.02151607,-0.07201223,-0.0008482378,-0.2272167,-0.0266332,-0.02549804,-0.1903206,-0.6396348,-0.2046725,-0.5929071,-0.1595166,-0.3806949,-0.3016656,-0.6618877,-0.4561213,-0.2442388,-0.2106973,-0.2194878,-0.2674227,-0.2511434,-0.1897303,-0.1728633,-0.1926026,-0.1878877
75%,0.3441866,0.373825,0.3652025,-0.067561,0.6244228,0.58236,0.4156076,0.4227347,0.7112077,0.3010852,0.3600745,0.637278,0.6775592,0.6741009,0.5886326,0.6510881,0.3645604,0.6918209,0.6627343,0.363913,0.493838,0.4663479,0.3475235,0.3524137,0.06091536,0.09171561,0.5988427,0.02025222,-0.2442388,-0.2106973,-0.2194878,0.2951765,0.3456082,0.385524,0.3242168,0.3211763,0.3760821
max,4.650881,6.341198,6.114932,17.28841,3.447841,6.214129,9.864176,5.90213,1.446526,6.769596,6.006855,4.158965,3.788799,3.793432,3.738779,4.218421,6.459615,4.28016,4.081937,6.245324,7.777504,5.958382,7.717664,9.126727,12.3129,8.331038,8.373264,9.630516,11.6056,9.682879,12.87086,6.713986,5.878518,5.715144,6.846821,7.852025,6.705677


In [15]:
min_max_scaler = preprocessing.MinMaxScaler()
data_min_max = pd.DataFrame(min_max_scaler.fit_transform(df_norm_l1),columns = df.columns)


data_min_max.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,0.223045,0.184781,0.187117,0.011548,0.546753,0.168135,0.479384,0.481413,0.798248,0.148149,0.359204,0.402783,0.461495,0.463493,0.440581,0.414819,0.166175,0.490276,0.469022,0.183385,0.08089,0.130137,0.071344,0.120356,0.029991,0.087329,0.073257,0.04522,0.020611,0.021296,0.016767,0.148705,0.177621,0.174822,0.15429,0.13865,0.164087
std,0.167182,0.128657,0.133035,0.057218,0.131558,0.133968,0.052818,0.087931,0.13958,0.12593,0.106758,0.143706,0.142239,0.141538,0.149739,0.138826,0.129181,0.11918,0.130179,0.130855,0.118265,0.146101,0.12042,0.096454,0.07884,0.109634,0.110763,0.099216,0.084453,0.101152,0.07645,0.12689,0.140002,0.144494,0.123612,0.109781,0.124752
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.107487,0.0968,0.096038,0.00263,0.469776,0.065873,0.464503,0.443985,0.738345,0.066253,0.291008,0.304397,0.371256,0.372855,0.337979,0.318722,0.080083,0.411579,0.38509,0.092904,0.0,0.0,0.0,0.064865,0.0,0.030251,0.0,0.0,0.0,0.0,0.0,0.067518,0.081125,0.070357,0.071396,0.065695,0.074713
50%,0.18958,0.161565,0.161664,0.004569,0.554262,0.156673,0.485781,0.474067,0.826426,0.125417,0.337364,0.397036,0.45934,0.46045,0.429807,0.414701,0.136846,0.487104,0.465705,0.158499,0.005301,0.100257,0.0,0.104981,0.0,0.054281,0.0,0.0,0.0,0.0,0.0,0.114798,0.142487,0.147428,0.132938,0.117521,0.140665
75%,0.280543,0.232839,0.235665,0.007685,0.628839,0.246094,0.501319,0.518556,0.897442,0.186036,0.397616,0.494295,0.557797,0.558831,0.528656,0.505138,0.213234,0.572665,0.55523,0.230969,0.139249,0.198219,0.113161,0.154322,0.03479,0.097376,0.139536,0.047228,0.0,0.0,0.0,0.186132,0.22597,0.230486,0.194337,0.173882,0.210968
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Creyting More Variables for us to use


## Normalize the Rest of the DF

df['market_cap'] =df['P(1Day)'] * df['C6']
df['market_cap'] = np.where(df['market_cap'] ==0, 0.001, df['market_cap'])
df.describe()

## Feature Selection

In [16]:
cols_test_y1 = ['C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'C3_Prime',	'C5_Prime', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5']

cols_test_y2 = ['C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'C3_Prime',	'C5_Prime',	'C6_Prime', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5']

dataset_y1 = data_min_max[cols_test_y1].copy()
dataset_y2 = data_min_max[cols_test_y2].copy()
target_y1 = df_y1.copy()
target_y2 = df_y2.copy()

#s1Prime causes us to have an error for some reason

## Y1 Columns

In [17]:
dataset_y1 = dataset_y1.values
target_y1 = target_y1.values

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 6)
rfe = rfe.fit(dataset_y1, target_y1)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[False False False False False False  True False False False False False
  True  True False  True False  True False False False False False False
 False  True False]
[ 8 12  3  2  4 15  1  9 18 22 17 10  1  1 20  1  7  1 16 21 19  6  5 14
 13  1 11]


## Y2

In [18]:
dataset_y2 = dataset_y2 = dataset_y2.values
target_y2 = target_y2.values

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 6)
rfe = rfe.fit(dataset_y2, target_y2)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[False  True  True False False False False False False False False False
 False  True False False False  True False False False False False False
 False  True False  True]
[15  1  1 19  6 22  9 21 23 16  7 20  2  1 17  8  5  1  4  3 18 10 14 13
 11  1 12  1]


## Collumns we want to keep

In [19]:
cols_keep = ['C7', 'S1', 'S2', 'C3_Prime','S2_Prime', 'industry_4', 'C2','C3', 'S2','C6_Prime', 'industry_3', 'industry_5']
df_output = data_min_max.copy()
df_output = df_output[cols_keep]
df_output['Y1']=df_y1.values
df_output['Y2']=df_y2.values
df_output.head()

Unnamed: 0,C7,S1,S2,C3_Prime,S2_Prime,industry_4,C2,C3,S2.1,C6_Prime,industry_3,industry_5,Y1,Y2
0,0.302904,0.100461,0.517722,0.090534,0.108262,0.0,0.136846,0.510224,0.517722,0.042462,0.0,0.0,0,1
1,0.315389,0.165312,0.333894,0.0,0.186281,0.0,0.0,0.462825,0.333894,0.0,0.0,0.0,1,0
2,0.319898,0.245309,0.575044,0.0,0.180402,0.0,0.332752,0.448673,0.575044,0.0,0.0,0.0,1,0
3,0.339262,0.366536,0.464781,0.0,0.233426,0.0,0.37731,0.447784,0.464781,0.0,0.0,0.0,1,1
4,0.388743,0.146174,0.431454,0.120591,0.158148,0.0,0.182279,0.495896,0.431454,0.025452,0.182279,0.0,0,1


## Write to Datafile

In [51]:
# Separating out the features
x_1 = df_output.iloc[:, :6].values
x_2 = df_output.iloc[:,6:12].values
# Separating out the target
y_1 = df_output.iloc[:,12].values
y_2 = df_output.iloc[:,13].values
y_2[0:5]
#df.reset_index(drop=True, inplace=True)

array([1, 0, 0, 1, 1])

In [62]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
principalComponents1 = pca.fit_transform(x_1)
principalDf1 = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2', 'principal component 3'])
principalDf1.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.099988,0.02842,-0.0351
1,0.029675,-0.133191,0.028776
2,-0.040178,-0.05511,0.007335
3,0.10182,-0.139768,-0.003695
4,0.028553,0.012134,-0.041996


In [64]:
pca = PCA(n_components=3)
principalComponents2 = pca.fit_transform(x_2)
principalDf2 = pd.DataFrame(data = principalComponents2
             , columns = ['principal component 1_2', 'principal component 2_2', 'principal component 3_2'])
principalDf2.head()

Unnamed: 0,principal component 1_2,principal component 2_2,principal component 3_2
0,-0.044507,-0.003172,-0.016159
1,-0.05333,0.207888,0.033814
2,0.079786,-0.163631,-0.040356
3,0.177249,-0.10046,-0.042366
4,0.070455,0.022115,0.150465


In [68]:
df_output = principalDf1.copy()
df_output =pd.concat([principalDf1, principalDf2], axis=1)
df_output['Y1']=df_y1.values
df_output['Y2']=df_y2.values
df_output.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 1_2,principal component 2_2,principal component 3_2,Y1,Y2
0,-0.099988,0.02842,-0.0351,-0.044507,-0.003172,-0.016159,0,1
1,0.029675,-0.133191,0.028776,-0.05333,0.207888,0.033814,1,0
2,-0.040178,-0.05511,0.007335,0.079786,-0.163631,-0.040356,1,0
3,0.10182,-0.139768,-0.003695,0.177249,-0.10046,-0.042366,1,1
4,0.028553,0.012134,-0.041996,0.070455,0.022115,0.150465,0,1


In [69]:
df_output.to_csv('datasets/group13.csv', sep=',')