In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import backend as bk
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from pandas import ExcelWriter
from sklearn import preprocessing

# Use the 'ggplot' matplotlib style for every figure drawn in this notebook.
plt.style.use('ggplot')
# None removes the column cap, so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

## Import File and Reset Index

In [2]:
# Read the raw competition workbook; cells containing '-' are parsed as missing,
# and the I1 column becomes the row index.
df = pd.read_excel('Competition1_raw_data.xlsx', na_values='-').set_index('I1')
df.head(1)

Unnamed: 0_level_0,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AATI,ADVANCED ANALOGIC TECHNOLOGIES INC,3674,10.0,9.5,8.5,11.87,122.0,1.0,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0


## Changing DataTypes

In [3]:
# Every column except I2 is forced to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce').
to_float = ['I3','P(IPO)','P(H)', 'P(L)', 'P(1Day)', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'S1', 'S2', 'S3']

df = df.assign(**{name: pd.to_numeric(df[name], errors='coerce') for name in to_float})
df.dtypes

I2          object
I3         float64
P(IPO)     float64
P(H)       float64
P(L)       float64
P(1Day)    float64
C1         float64
C2         float64
C3         float64
C4         float64
C5         float64
C6         float64
C7         float64
T1         float64
T2         float64
T3         float64
T4         float64
T5         float64
S1         float64
S2         float64
S3         float64
dtype: object

## Cleaning/Imputing

For our predictor variables only

In [4]:
# Rows without a first-day price cannot be labelled, so drop them.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df.dropna(subset=['P(1Day)']).copy()

## Creating Necessary Variables Y1, Y2, C3', C5', and C6'
## np.where pattern from https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column
# Midpoint of the high/low price range.
df['P(Mid)'] = (df['P(H)'] + df['P(L)']) / 2

# Binary targets. A comparison against NaN evaluates to False, so rows with a
# missing price fall into the 0 branch.
df['Y1'] = np.where(df['P(IPO)'] < df['P(Mid)'], 1, 0)
df['Y2'] = np.where(df['P(IPO)'] < df['P(1Day)'], 1, 0)

# C3': 1 when C3 is strictly positive, else 0.
df["C3_Prime"] = np.where(df['C3'] > 0, 1, 0)
# C5': ratio of C5 to C6.
df["C5_Prime"] = df['C5'] / df['C6']
# C6': percentage by which P(IPO) exceeds the midpoint; 0 when it does not exceed it.
df["C6_Prime"] = np.where(
    df['P(IPO)'] > df['P(Mid)'],
    ((df['P(IPO)'] - df['P(Mid)']) / df['P(Mid)']) * 100,
    0,
)

In [5]:
# Keep standalone handles on the two target columns; later cells read them
# through .values after df itself has been rebuilt.
df_y1, df_y2 = df['Y1'], df['Y2']

## Now we need to change our industry columns

In [6]:
# Sector labels that are folded into a single 'Other' bucket.
other_sectors = ['Mining', 'Wholesale Trade', 'Finance/Insurance/Realestate', 'Non Classafiable Establishments',
                 'Agriculture/Forestry/Fishing', 'Construction', 'NaN']
# Integer code assigned to each remaining sector label.
sector_codes = {'Manufacturing': 1, 'Services': 2, 'Transportation/Communications/Utulities': 3, 'Retail Trade': 4,'Other': 5}

# Map I3 to a sector label via bk.assign_sector, collapse the small sectors,
# convert the labels to codes, and treat anything still missing as 'Other' (5).
df['industry'] = (
    df['I3']
    .apply(bk.assign_sector)
    .replace(other_sectors, 'Other')
    .replace(sector_codes)
    .fillna(5)
)

# One 0/1 indicator column per sector code: industry_1 ... industry_5.
for code in range(1, 6):
    df['industry_' + str(code)] = np.where(df['industry'] == code, 1, 0)


## Now we need to create our ratio columns


In [7]:
# T2 is used as a denominator below, so its zeros are replaced by the column
# mean (computed before replacement) to avoid dividing by zero.
t2_mean = df['T2'].mean()
df['T2'] = df['T2'].mask(df['T2'] == 0, t2_mean)

# (numerator, denominator) pairs; each yields a '<numerator>_Prime' column.
# T4 is the only one divided by T1 rather than T2.
ratio_pairs = [('S1', 'T2'), ('S2', 'T2'), ('S3', 'T2'), ('T3', 'T2'), ('T4', 'T1'), ('T5', 'T2')]
for top, bottom in ratio_pairs:
    df[top + "_Prime"] = df[top] / df[bottom]

## No longer need these columns

In [8]:
# I2 and I3 are not needed after the industry encoding; remove them.
df = df.drop(['I2', 'I3'], axis=1)
df.head()

Unnamed: 0_level_0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
AATI,10.0,9.5,8.5,11.87,122.0,1.0,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,9.0,0,1,1,3.864345,11.111111,1.0,1,0,0,0,0,0.004875,0.009199,0.010929,0.908876,0.640426,0.05425
ABPI,8.0,10.0,8.0,7.25,259.0,0.0,-1.62,-0.013352,28869196.0,2400000.0,25.936,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0,9.0,1,0,0,12.028832,0.0,1.0,1,0,0,0,0,0.003258,0.011105,0.010876,0.898724,0.644753,0.051395
ACAD,7.0,14.0,12.0,6.7,90.0,1.0,-1.24,0.020715,16845668.0,5000000.0,7.378,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0,13.0,1,0,0,3.369134,0.0,1.0,1,0,0,0,0,0.011593,0.006271,0.011403,0.90935,0.636816,0.061764
ACHN,11.5,16.0,14.0,12.39,209.0,1.0,-0.91,0.020023,14848637.0,4500000.0,8.526,328.0,8259.0,7574.0,177.0,509.0,80.0,59.0,110.0,15.0,1,1,0,3.299697,0.0,1.0,1,0,0,0,0,0.009686,0.007144,0.013319,0.91706,0.539634,0.06163
ACLI,21.0,21.0,19.0,56.599998,80.0,1.0,0.07,-0.034895,30741716.0,8250000.0,632.298,572.0,14830.0,13176.0,336.0,720.0,67.0,149.0,167.0,20.0,0,1,1,3.726269,5.0,3.0,0,0,1,0,0,0.004518,0.010047,0.011261,0.888469,0.587413,0.04855


## Getting rid of our 0 value columns

In [9]:
# Replace sentinel values in T3, S1 and T5 with each column's own mean.
# Each mean is computed before the replacement, so it still includes the sentinel rows.
# NOTE(review): S1 and T5 are matched against -1 while the section heading
# speaks of zeros — confirm which sentinel the data actually uses.
# NOTE(review): the *_Prime ratio columns were already built from the
# un-imputed values in an earlier cell — confirm that ordering is intended.

# T3: zeros are replaced with the column mean.
t3_mean = df['T3'].mean()
df['T3'] = np.where(df['T3'] == 0, t3_mean, df['T3'])
print(t3_mean)

# S1: -1 entries are replaced with the column mean.
s1_mean = df['S1'].mean()
df['S1'] = np.where(df['S1'] == -1, s1_mean, df['S1'])
print(s1_mean)

# T5: -1 entries are replaced with the T5 mean.
# (Fix: s1_mean was previously used here, filling T5 with the mean of another column.)
t5_mean = df['T5'].mean()
df['T5'] = np.where(df['T5'] == -1, t5_mean, df['T5'])
print(t5_mean)

11463.39393939394
68.67575757575757
669.8909090909091


## More Imputation

In [10]:
# C7: natural-log transform, then fill missing entries with the mean of the
# transformed column.
c7_logged = np.log(df['C7'])
c7_mean = c7_logged.mean()
print(c7_mean)
df['C7'] = c7_logged.fillna(c7_mean)

# C3: cube-root transform (defined for negative values too), then fill missing
# entries with the mean of the transformed column.
c3_rooted = np.cbrt(df['C3'])
c3_mean = c3_rooted.mean()
print(c3_mean)
df['C3'] = c3_rooted.fillna(c3_mean)

4.592588484196865
-0.21843318437028425


## Fix our Data for Normalization

In [11]:
# Snapshot the frame before any scaling; the unscaled indicator and target
# columns are copied back from df_hold later on.
df_hold = df.copy(deep=True)
df_hold.head()

Unnamed: 0_level_0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
AATI,10.0,9.5,8.5,11.87,122.0,1.0,1.508104,0.029074,40962052.0,10600000.0,3.938568,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,9.0,0,1,1,3.864345,11.111111,1.0,1,0,0,0,0,0.004875,0.009199,0.010929,0.908876,0.640426,0.05425
ABPI,8.0,10.0,8.0,7.25,259.0,0.0,-1.17446,-0.013352,28869196.0,2400000.0,3.255632,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0,9.0,1,0,0,12.028832,0.0,1.0,1,0,0,0,0,0.003258,0.011105,0.010876,0.898724,0.644753,0.051395
ACAD,7.0,14.0,12.0,6.7,90.0,1.0,-1.074337,0.020715,16845668.0,5000000.0,1.998503,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0,13.0,1,0,0,3.369134,0.0,1.0,1,0,0,0,0,0.011593,0.006271,0.011403,0.90935,0.636816,0.061764
ACHN,11.5,16.0,14.0,12.39,209.0,1.0,-0.969052,0.020023,14848637.0,4500000.0,2.14312,328.0,8259.0,7574.0,177.0,509.0,80.0,59.0,110.0,15.0,1,1,0,3.299697,0.0,1.0,1,0,0,0,0,0.009686,0.007144,0.013319,0.91706,0.539634,0.06163
ACLI,21.0,21.0,19.0,56.599998,80.0,1.0,0.412129,-0.034895,30741716.0,8250000.0,6.449361,572.0,14830.0,13176.0,336.0,720.0,67.0,149.0,167.0,20.0,0,1,1,3.726269,5.0,3.0,0,0,1,0,0,0.004518,0.010047,0.011261,0.888469,0.587413,0.04855


In [12]:
# Experiment: L1 normalisation (L2 is the alternative to compare against).
# C6 is first replaced by the reciprocal of its natural log.
df['C6'] = 1 / np.log(df['C6'])

# Indicator and target columns are left out of the normalisation.
not_normalized = ['C2','C3_Prime','industry', 'industry_1', 'industry_2', 'industry_3', 'industry_4', 'industry_5', 'Y1', 'Y2']
df = df.drop(columns=not_normalized)

# Rebuilding the frame from the returned array replaces the I1 index with a
# default 0..n-1 index.
# NOTE(review): preprocessing.normalize defaults to axis=1, i.e. every ROW is
# scaled to unit L1 norm — confirm that per-sample rather than per-feature
# scaling is what is wanted here.
l1_values = preprocessing.normalize(df, norm='l1')
df = pd.DataFrame(l1_values, columns=df.columns)
df.skew()

P(IPO)       0.000000
P(H)         0.000000
P(L)         0.000000
P(1Day)     14.838865
C1           7.037575
C3           0.000000
C4           0.000000
C5          -1.779095
C6           0.000000
C7           0.000000
T1           1.736247
T2           1.799530
T3           1.776135
T4           1.649220
T5           1.732117
S1           0.000000
S2           3.164730
S3           2.172735
P(Mid)       0.000000
C5_Prime     0.000000
C6_Prime     0.000000
S1_Prime     0.000000
S2_Prime     0.000000
S3_Prime     0.000000
T3_Prime     0.000000
T4_Prime     0.000000
T5_Prime     0.000000
dtype: float64

In [13]:
# Columns that receive a plain natural-log transform.
pos_skew = ['C1']
df[pos_skew] = np.log(df[pos_skew])

# Columns for which a plain log over-corrects; use the reciprocal of the log instead.
over_log = ['T1', 'T2', 'T3', 'T5', 'S2', 'S3']
df[over_log] = 1 / np.log(df[over_log])

# T4 gets a square-root transform.
df['T4'] = np.sqrt(df['T4'])


In [14]:
# Re-check per-column skewness after the log / reciprocal-log / sqrt transforms.
df.skew()

P(IPO)       0.000000
P(H)         0.000000
P(L)         0.000000
P(1Day)     14.838865
C1          -0.408258
C3           0.000000
C4           0.000000
C5          -1.779095
C6           0.000000
C7           0.000000
T1           0.229093
T2          -0.015612
T3           0.003223
T4           0.505705
T5           0.233782
S1           0.000000
S2           0.053895
S3           0.243292
P(Mid)       0.000000
C5_Prime     0.000000
C6_Prime     0.000000
S1_Prime     0.000000
S2_Prime     0.000000
S3_Prime     0.000000
T3_Prime     0.000000
T4_Prime     0.000000
T5_Prime     0.000000
dtype: float64

df = pd.DataFrame(preprocessing.scale(df),columns = df.columns)
df.describe()

In [15]:
# Rescale every remaining column to the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit(df).transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns)

df.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),C5_Prime,C6_Prime,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,0.223044,0.184781,0.187117,0.011548,0.546753,0.479384,0.481413,0.798238,0.148149,0.359204,0.402783,0.461495,0.463493,0.440581,0.414819,0.166175,0.490276,0.469022,0.183385,0.120356,0.029991,0.148705,0.17762,0.174822,0.15429,0.138649,0.164086
std,0.167182,0.128657,0.133035,0.057218,0.131558,0.052818,0.087931,0.139589,0.12593,0.106758,0.143706,0.142239,0.141538,0.149739,0.138826,0.129181,0.11918,0.130179,0.130855,0.096454,0.07884,0.12689,0.140002,0.144494,0.123612,0.109781,0.124752
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.107487,0.0968,0.096038,0.00263,0.469776,0.464503,0.443985,0.738304,0.066253,0.291008,0.304397,0.371257,0.372856,0.337979,0.318722,0.080083,0.411579,0.38509,0.092904,0.064865,0.0,0.067518,0.081124,0.070357,0.071396,0.065695,0.074713
50%,0.189579,0.161565,0.161663,0.004569,0.554262,0.485781,0.474067,0.826409,0.125417,0.337363,0.397036,0.459341,0.46045,0.429806,0.414701,0.136846,0.487104,0.465705,0.158499,0.104981,0.0,0.114798,0.142487,0.147428,0.132938,0.117521,0.140665
75%,0.280543,0.232839,0.235665,0.007685,0.628838,0.501319,0.518557,0.897429,0.186036,0.397615,0.494295,0.557797,0.558832,0.528656,0.505138,0.213234,0.572665,0.55523,0.230968,0.154322,0.03479,0.186132,0.225969,0.230486,0.194337,0.173882,0.210968
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Restore the columns that were deliberately kept out of normalisation/standardisation.
cols_add_back = ['C2','C3_Prime','industry_1', 'industry_2', 'industry_3', 'industry_4', 'industry_5', 'Y1', 'Y2']

# df now carries a default 0..n-1 index, so give df_hold the same index before
# copying; the column assignment aligns rows on index labels.
df_hold = df_hold.reset_index(drop=True)
for col in cols_add_back:
    df[col] = df_hold[col]
df.describe()

Unnamed: 0,P(IPO),P(H),P(L),P(1Day),C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),C5_Prime,C6_Prime,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime,C2,C3_Prime,industry_1,industry_2,industry_3,industry_4,industry_5,Y1,Y2
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,0.223044,0.184781,0.187117,0.011548,0.546753,0.479384,0.481413,0.798238,0.148149,0.359204,0.402783,0.461495,0.463493,0.440581,0.414819,0.166175,0.490276,0.469022,0.183385,0.120356,0.029991,0.148705,0.17762,0.174822,0.15429,0.138649,0.164086,0.859091,0.477273,0.448485,0.30303,0.087879,0.072727,0.087879,0.5,0.698485
std,0.167182,0.128657,0.133035,0.057218,0.131558,0.052818,0.087931,0.139589,0.12593,0.106758,0.143706,0.142239,0.141538,0.149739,0.138826,0.129181,0.11918,0.130179,0.130855,0.096454,0.07884,0.12689,0.140002,0.144494,0.123612,0.109781,0.124752,0.348192,0.499862,0.497716,0.459917,0.283333,0.259885,0.283333,0.500379,0.459264
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.107487,0.0968,0.096038,0.00263,0.469776,0.464503,0.443985,0.738304,0.066253,0.291008,0.304397,0.371257,0.372856,0.337979,0.318722,0.080083,0.411579,0.38509,0.092904,0.064865,0.0,0.067518,0.081124,0.070357,0.071396,0.065695,0.074713,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.189579,0.161565,0.161663,0.004569,0.554262,0.485781,0.474067,0.826409,0.125417,0.337363,0.397036,0.459341,0.46045,0.429806,0.414701,0.136846,0.487104,0.465705,0.158499,0.104981,0.0,0.114798,0.142487,0.147428,0.132938,0.117521,0.140665,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0
75%,0.280543,0.232839,0.235665,0.007685,0.628838,0.501319,0.518557,0.897429,0.186036,0.397615,0.494295,0.557797,0.558832,0.528656,0.505138,0.213234,0.572665,0.55523,0.230968,0.154322,0.03479,0.186132,0.225969,0.230486,0.194337,0.173882,0.210968,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Feature Selection

In [17]:
# Candidate predictors for Y2. S1_Prime is left out of both lists because
# including it raised an error (cause not yet identified).
cols_test_y2 = [
    'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',
    'T1', 'T2', 'T3', 'T4', 'T5',
    'S1', 'S2', 'S3',
    'C3_Prime', 'C5_Prime', 'C6_Prime',
    'S2_Prime', 'S3_Prime', 'T3_Prime', 'T4_Prime', 'T5_Prime',
    'industry_1', 'industry_2', 'industry_3', 'industry_4', 'industry_5',
]

# Candidates for Y1: the same columns in the same order, minus C6_Prime.
cols_test_y1 = [name for name in cols_test_y2 if name != 'C6_Prime']

# Independent copies of the predictor matrices, plus the two target columns.
dataset_y1 = df.loc[:, cols_test_y1].copy()
dataset_y2 = df.loc[:, cols_test_y2].copy()
target_y1, target_y2 = df['Y1'], df['Y2']

## Y1 Columns

In [18]:
# Convert to plain arrays. np.asarray leaves an existing ndarray unchanged, so
# this cell can be re-run safely (a second `.values` call would fail on an ndarray).
dataset_y1 = np.asarray(dataset_y1)
target_y1 = np.asarray(target_y1)

# Base classifier whose coefficients RFE uses to rank the attributes.
model = LogisticRegression()
# Recursive feature elimination keeping 6 attributes. n_features_to_select is
# passed by keyword because it is keyword-only in current scikit-learn releases.
rfe = RFE(model, n_features_to_select=6)
rfe = rfe.fit(dataset_y1, target_y1)
# support_: boolean mask of the kept attributes, in the column order of dataset_y1.
# ranking_: 1 for kept attributes, larger numbers for those eliminated earlier.
print(rfe.support_)
print(rfe.ranking_)

[False False  True False  True False  True False False False False False
  True  True False False False  True False False False False False False
 False False False]
[ 7 10  1  2  1  9  1  6 15 21 14  4  1  1 19 12  5  1 22 13 20  8 17 18
 11  3 16]


## Y2

In [19]:
# Convert to plain arrays. np.asarray leaves an existing ndarray unchanged, so
# this cell can be re-run safely (a second `.values` call would fail on an ndarray).
dataset_y2 = np.asarray(dataset_y2)
target_y2 = np.asarray(target_y2)

# Base classifier whose coefficients RFE uses to rank the attributes.
model = LogisticRegression()
# Recursive feature elimination keeping 6 attributes. n_features_to_select is
# passed by keyword because it is keyword-only in current scikit-learn releases.
rfe = RFE(model, n_features_to_select=6)
rfe = rfe.fit(dataset_y2, target_y2)
# support_: boolean mask of the kept attributes, in the column order of dataset_y2.
# ranking_: 1 for kept attributes, larger numbers for those eliminated earlier.
print(rfe.support_)
print(rfe.ranking_)

[False False  True False False False False False False False False False
  True  True False False False  True False False False False False False
 False  True False  True]
[ 9  8  1 13  3 23  6 20 21 18  2 19  1  1 15 14  7  1  5  4 22 10 12 16
 11  1 17  1]


## Columns we want to keep

In [20]:
# First six names feed the Y1 model, last six the Y2 model. C3 and S1 appear in
# both halves on purpose: later cells slice this frame by position (0-5 / 6-11),
# so the duplicated labels must stay.
# NOTE(review): read against cols_test_y1, the Y1 RFE mask printed earlier
# appears to mark C3, C5, C7, S1, S2, S2_Prime, which differs from the Y1 list
# below (S3, T3_Prime) — verify which selection is intended.
y1_features = ['C3', 'C5', 'C7', 'S1','S3', 'T3_Prime']
y2_features = ['C3','S1', 'S2','C6_Prime', 'industry_3', 'industry_5']
cols_keep = y1_features + y2_features

df_output = df[cols_keep].copy()
# Targets are attached positionally (.values), bypassing index alignment.
df_output['Y1'] = df_y1.values
df_output['Y2'] = df_y2.values
df_output.head()

Unnamed: 0,C3,C5,C7,S1,S3,T3_Prime,C3.1,S1.1,S2,C6_Prime,industry_3,industry_5,Y1,Y2
0,0.510224,0.867891,0.302904,0.100461,0.515752,0.090759,0.510224,0.100461,0.517722,0.042462,0,0,0,1
1,0.462825,0.676412,0.315389,0.165311,0.349676,0.127223,0.462825,0.165311,0.333894,0.0,0,0,1,0
2,0.448673,0.865253,0.319898,0.245309,0.507248,0.220803,0.448673,0.245309,0.575044,0.0,0,0,1,0
3,0.447784,0.755935,0.339262,0.366536,0.370206,0.252492,0.447784,0.366536,0.464781,0.0,0,0,1,1
4,0.495896,0.795346,0.388743,0.146174,0.43028,0.118177,0.495896,0.146174,0.431454,0.025452,1,0,0,1


## PCA

In [21]:
# df_output layout by position: columns 0-5 are the first feature set,
# columns 6-11 the second, column 12 is Y1 and column 13 is Y2.
x_1 = df_output.iloc[:, 0:6].values
x_2 = df_output.iloc[:, 6:12].values

# Targets, taken by position as well.
y_1 = df_output.iloc[:, 12].values
y_2 = df_output.iloc[:, 13].values

In [22]:
from sklearn.decomposition import PCA

# Project the six columns of x_1 onto their first three principal components.
pca = PCA(n_components=3)
principalComponents1 = pca.fit_transform(x_1)

component_names_1 = ['principal component %d' % k for k in (1, 2, 3)]
principalDf1 = pd.DataFrame(principalComponents1, columns=component_names_1)
principalDf1.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.1344,0.012658,7.855331e-07
1,0.095122,0.093564,-0.07782236
2,-0.001274,0.014014,0.02396647
3,0.196516,0.09469,0.01315715
4,0.004332,-0.003917,-0.01200637


In [23]:
# Total share of variance captured by the components of the most recently fitted PCA.
sum(pca.explained_variance_ratio_)

0.9141809630708942

In [24]:
# Project the six columns of x_2 onto their first three principal components.
# `pca` is rebound here, so it now refers to the model fitted on x_2.
pca = PCA(n_components=3)
principalComponents2 = pca.fit_transform(x_2)

component_names_2 = ['principal component %d_2' % k for k in (1, 2, 3)]
principalDf2 = pd.DataFrame(principalComponents2, columns=component_names_2)
principalDf2.head()

Unnamed: 0,principal component 1_2,principal component 2_2,principal component 3_2
0,0.000669,-0.120218,-0.069552
1,-0.002557,-0.123238,0.095417
2,0.000671,-0.129014,-0.005741
3,-0.000485,-0.134539,0.156945
4,-0.707134,0.584266,0.039571


In [25]:
# Total share of variance captured by the components of the most recently fitted PCA.
sum(pca.explained_variance_ratio_)

0.9377976092478004

In [26]:
# Final table: the two sets of principal components side by side, followed by
# both targets (attached positionally through .values).
component_frames = [principalDf1, principalDf2]
df_output = pd.concat(component_frames, axis=1)
df_output['Y1'] = df_y1.values
df_output['Y2'] = df_y2.values
df_output.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 1_2,principal component 2_2,principal component 3_2,Y1,Y2
0,-0.1344,0.012658,7.855331e-07,0.000669,-0.120218,-0.069552,0,1
1,0.095122,0.093564,-0.07782236,-0.002557,-0.123238,0.095417,1,0
2,-0.001274,0.014014,0.02396647,0.000671,-0.129014,-0.005741,1,0
3,0.196516,0.09469,0.01315715,-0.000485,-0.134539,0.156945,1,1
4,0.004332,-0.003917,-0.01200637,-0.707134,0.584266,0.039571,0,1


In [28]:
# Write the final table as comma-separated text; the row index is included as
# the first (unnamed) column because index=False is not passed.
# NOTE(review): assumes the datasets2/ directory already exists — confirm.
df_output.to_csv('datasets2/group13.csv', sep=',')