In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import backend as bk
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from pandas import ExcelWriter
from sklearn import preprocessing

plt.style.use('ggplot')
pd.options.display.max_columns = None

## Import File and Reset Index

In [2]:
df = pd.read_excel('Competition1_raw_data.xlsx', na_values='-')
df.set_index('I1', inplace=True)
df.head(1)

Unnamed: 0_level_0,I2,I3,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AATI,ADVANCED ANALOGIC TECHNOLOGIES INC,3674,10.0,9.5,8.5,11.87,122.0,1.0,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0


## Changing DataTypes

In [3]:
to_float = ['I3','P(IPO)','P(H)', 'P(L)', 'P(1Day)', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'T1', 'T2', 'T3', 'T4', 'T5', 'S1', 'S2', 'S3']

for col in to_float:
    df[col]=pd.to_numeric(df[col], errors='coerce')
df.dtypes

I2          object
I3         float64
P(IPO)     float64
P(H)       float64
P(L)       float64
P(1Day)    float64
C1         float64
C2         float64
C3         float64
C4         float64
C5         float64
C6         float64
C7         float64
T1         float64
T2         float64
T3         float64
T4         float64
T5         float64
S1         float64
S2         float64
S3         float64
dtype: object

## Cleanining/Imputing

For our predictor variables only

In [4]:
df = df.dropna(subset=['P(1Day)'])
## Creating Necessary Variables Y1, Y2, C3', C5', and C6'
## Code found on https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column
df['P(Mid)'] = (df['P(H)'] + df['P(L)']) / 2
df['Y1'] = np.where(df['P(IPO)'] < df['P(Mid)'], 1,0)
df['Y2'] = np.where(df['P(IPO)'] < df['P(1Day)'], 1,0)
df["C3_Prime"] = np.where(df['C3'] > 0, 1, 0)
df["C5_Prime"] = df['C5'] / df['C6']
df["C6_Prime"] = np.where(df['P(IPO)'] > df['P(Mid)'], ((df['P(IPO)'] - df['P(Mid)']) / df['P(Mid)']) * 100,0)

In [5]:
df_y1 = df['Y1']
df_y2= df['Y2']

## Now we need to change our industry columns

In [6]:
df['industry'] = df['I3'].apply(bk.assign_sector)

df['industry'] = df['industry'].replace(['Mining', 'Wholesale Trade', 'Finance/Insurance/Realestate', 'Non Classafiable Establishments',
                                            'Agriculture/Forestry/Fishing', 'Construction', 'NaN'],'Other')
df['industry'].value_counts()

df['industry'] = df['industry'].replace({'Manufacturing': 1, 'Services': 2, 'Transportation/Communications/Utulities': 3, 'Retail Trade': 4,'Other': 5})

df['industry'] = df['industry'].fillna(5)

df['industry_1'] = np.where(df['industry'] ==1, 1, 0)
df['industry_2'] = np.where(df['industry'] ==2, 1, 0)
df['industry_3'] = np.where(df['industry'] ==3, 1, 0)
df['industry_4'] = np.where(df['industry'] ==4, 1, 0)
df['industry_5'] = np.where(df['industry'] ==5, 1, 0)


## Now we need to create our ratio columns


In [7]:
t2_mean = df['T2'].mean()
df['T2'] = np.where(df['T2'] ==0, t2_mean, df['T2'])
df["S1_Prime"]= df['S1']/df['T2']
df["S2_Prime"]= df['S2']/df['T2']
df["S3_Prime"]= df['S3']/df['T2']
df["T3_Prime"]= df['T3']/df['T2']
df["T4_Prime"]= df['T4']/df['T1']
df["T5_Prime"]= df['T5']/df['T2']

## No longer need these columns

In [8]:
df = df.drop(columns = ['I2', 'I3'])
df.head()

Unnamed: 0_level_0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
AATI,10.0,9.5,8.5,11.87,122.0,1.0,3.43,0.029074,40962052.0,10600000.0,51.345,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,9.0,0,1,1,3.864345,11.111111,1.0,1,0,0,0,0,0.004875,0.009199,0.010929,0.908876,0.640426,0.05425
ABPI,8.0,10.0,8.0,7.25,259.0,0.0,-1.62,-0.013352,28869196.0,2400000.0,25.936,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0,9.0,1,0,0,12.028832,0.0,1.0,1,0,0,0,0,0.003258,0.011105,0.010876,0.898724,0.644753,0.051395
ACAD,7.0,14.0,12.0,6.7,90.0,1.0,-1.24,0.020715,16845668.0,5000000.0,7.378,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0,13.0,1,0,0,3.369134,0.0,1.0,1,0,0,0,0,0.011593,0.006271,0.011403,0.90935,0.636816,0.061764
ACHN,11.5,16.0,14.0,12.39,209.0,1.0,-0.91,0.020023,14848637.0,4500000.0,8.526,328.0,8259.0,7574.0,177.0,509.0,80.0,59.0,110.0,15.0,1,1,0,3.299697,0.0,1.0,1,0,0,0,0,0.009686,0.007144,0.013319,0.91706,0.539634,0.06163
ACLI,21.0,21.0,19.0,56.599998,80.0,1.0,0.07,-0.034895,30741716.0,8250000.0,632.298,572.0,14830.0,13176.0,336.0,720.0,67.0,149.0,167.0,20.0,0,1,1,3.726269,5.0,3.0,0,0,1,0,0,0.004518,0.010047,0.011261,0.888469,0.587413,0.04855


## Getting rid of our 0 value columns

In [9]:
#T3 need to get rid of thier zeros.
t3_mean = df['T3'].mean()
df['T3'] = np.where(df['T3'] ==0, t3_mean, df['T3'])
print(t3_mean)

#S1 need to get rid of thier zeros.
s1_mean = df['S1'].mean()
df['S1'] = np.where(df['S1'] ==-1, s1_mean, df['S1'])
print(s1_mean)

#T5 need to get rid of thier zeros.
t5_mean = df['T5'].mean()
df['T5'] = np.where(df['T5'] ==-1, s1_mean, df['T5'])
print(t5_mean)

11463.39393939394
68.67575757575757
669.8909090909091


## More Imputation

In [10]:
#nromalize qnd impute C7 values
df['C7'] = np.log(df['C7'])
c7_mean = df['C7'].mean()
print(c7_mean)
df['C7'] = df['C7'].fillna(c7_mean)

#normalize and impute C3 values
df['C3'] = np.cbrt(df['C3'])
c3_mean = df['C3'].mean()
print(c3_mean)
df['C3'] = df['C3'].fillna(c3_mean)

4.592588484196865
-0.21843318437028425


## Fix our Data for Normalization

In [11]:
#create our two dataframes we are working with 
df_hold=df.copy()
df_hold.head()

Unnamed: 0_level_0,P(IPO),P(H),P(L),P(1Day),C1,C2,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,P(Mid),Y1,Y2,C3_Prime,C5_Prime,C6_Prime,industry,industry_1,industry_2,industry_3,industry_4,industry_5,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
I1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
AATI,10.0,9.5,8.5,11.87,122.0,1.0,1.508104,0.029074,40962052.0,10600000.0,3.938568,470.0,12719.0,11560.0,301.0,690.0,62.0,117.0,139.0,9.0,0,1,1,3.864345,11.111111,1.0,1,0,0,0,0,0.004875,0.009199,0.010929,0.908876,0.640426,0.05425
ABPI,8.0,10.0,8.0,7.25,259.0,0.0,-1.17446,-0.013352,28869196.0,2400000.0,3.255632,791.0,21792.0,19585.0,510.0,1120.0,71.0,242.0,237.0,9.0,1,0,0,12.028832,0.0,1.0,1,0,0,0,0,0.003258,0.011105,0.010876,0.898724,0.644753,0.051395
ACAD,7.0,14.0,12.0,6.7,90.0,1.0,-1.074337,0.020715,16845668.0,5000000.0,1.998503,201.0,5262.0,4785.0,128.0,325.0,61.0,33.0,60.0,13.0,1,0,0,3.369134,0.0,1.0,1,0,0,0,0,0.011593,0.006271,0.011403,0.90935,0.636816,0.061764
ACHN,11.5,16.0,14.0,12.39,209.0,1.0,-0.969052,0.020023,14848637.0,4500000.0,2.14312,328.0,8259.0,7574.0,177.0,509.0,80.0,59.0,110.0,15.0,1,1,0,3.299697,0.0,1.0,1,0,0,0,0,0.009686,0.007144,0.013319,0.91706,0.539634,0.06163
ACLI,21.0,21.0,19.0,56.599998,80.0,1.0,0.412129,-0.034895,30741716.0,8250000.0,6.449361,572.0,14830.0,13176.0,336.0,720.0,67.0,149.0,167.0,20.0,0,1,1,3.726269,5.0,3.0,0,0,1,0,0,0.004518,0.010047,0.011261,0.888469,0.587413,0.04855


In [12]:
#try l1 vs l2
df['C6']=1/np.log(df['C6'])
df = df.drop(columns = ['P(IPO)','P(H)','P(L)','P(1Day)','P(Mid)','C2','C3_Prime','industry', 'industry_1', 'industry_2', 'industry_3', 'industry_4', 'industry_5', 'Y1', 'Y2'])
df = pd.DataFrame(preprocessing.normalize(df,norm='l2'),columns = df.columns)
df.skew()

C1          7.045573
C3          0.000000
C4          0.000000
C5          0.000000
C6          0.000000
C7          0.000000
T1          1.742721
T2          1.807072
T3          1.783650
T4          1.655954
T5          1.739481
S1          0.000000
S2          3.175224
S3          2.181432
C5_Prime    0.000000
C6_Prime    0.000000
S1_Prime    0.000000
S2_Prime    0.000000
S3_Prime    0.000000
T3_Prime    0.000000
T4_Prime    0.000000
T5_Prime    0.000000
dtype: float64

In [13]:
pos_skew = ['C1']

for col in pos_skew:
    df[col]=np.log(df[col])
    

#df_norm_l1['C6']=1/np.log(df_norm_l1['C6'])##

over_log = ['T1', 'T2', 'T3', 'T5', 'S2', 'S3']
for col in over_log:
    df[col]=1/np.log(df[col])

df['T4'] = np.sqrt(df['T4'])


In [14]:
df.skew()

C1         -0.407655
C3          0.000000
C4          0.000000
C5          0.000000
C6          0.000000
C7          0.000000
T1          0.226609
T2         -0.018567
T3          0.000290
T4          0.509403
T5          0.231179
S1          0.000000
S2          0.052038
S3          0.241062
C5_Prime    0.000000
C6_Prime    0.000000
S1_Prime    0.000000
S2_Prime    0.000000
S3_Prime    0.000000
T3_Prime    0.000000
T4_Prime    0.000000
T5_Prime    0.000000
dtype: float64

df = pd.DataFrame(preprocessing.scale(df),columns = df.columns)
df.describe()

In [15]:
min_max_scaler = preprocessing.MinMaxScaler()
df = pd.DataFrame(min_max_scaler.fit_transform(df),columns = df.columns)


df.describe()

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,C5_Prime,C6_Prime,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,0.546688,0.479567,0.481086,0.938706,0.147657,0.359102,0.403227,0.462142,0.464133,0.439797,0.415318,0.165689,0.490646,0.469433,0.120426,0.029993,0.148204,0.177386,0.17459,0.153774,0.138187,0.163532
std,0.131565,0.052826,0.087804,0.099672,0.125684,0.106706,0.143703,0.142188,0.141488,0.14964,0.13881,0.128984,0.119168,0.130162,0.09656,0.078893,0.126603,0.13997,0.144457,0.123371,0.109574,0.124493
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.469675,0.464696,0.443734,0.931013,0.065965,0.290975,0.304891,0.371986,0.373578,0.337266,0.319287,0.079772,0.411871,0.385476,0.064891,0.0,0.067249,0.08094,0.070209,0.071093,0.065417,0.074399
50%,0.554221,0.485963,0.473743,0.96915,0.124933,0.337215,0.397552,0.460018,0.461145,0.428971,0.415238,0.136376,0.487492,0.466174,0.104967,0.0,0.114345,0.14218,0.147219,0.132426,0.117061,0.140094
75%,0.628799,0.501494,0.518114,0.98882,0.185548,0.397546,0.49476,0.558422,0.559451,0.527729,0.505651,0.212557,0.573047,0.555633,0.154507,0.034775,0.185485,0.225611,0.230205,0.193606,0.17322,0.210276
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
#add back the columns we didnt want to normalize/standradrzie
cols_add_back = ['C2','C3_Prime','industry_1', 'industry_2', 'industry_3', 'industry_4', 'industry_5', 'Y1', 'Y2']
df_hold.reset_index(drop=True, inplace=True)
for col in cols_add_back:
    df[col] = df_hold[col]
df.describe()

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,T5,S1,S2,S3,C5_Prime,C6_Prime,S1_Prime,S2_Prime,S3_Prime,T3_Prime,T4_Prime,T5_Prime,C2,C3_Prime,industry_1,industry_2,industry_3,industry_4,industry_5,Y1,Y2
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,0.546688,0.479567,0.481086,0.938706,0.147657,0.359102,0.403227,0.462142,0.464133,0.439797,0.415318,0.165689,0.490646,0.469433,0.120426,0.029993,0.148204,0.177386,0.17459,0.153774,0.138187,0.163532,0.859091,0.477273,0.448485,0.30303,0.087879,0.072727,0.087879,0.5,0.698485
std,0.131565,0.052826,0.087804,0.099672,0.125684,0.106706,0.143703,0.142188,0.141488,0.14964,0.13881,0.128984,0.119168,0.130162,0.09656,0.078893,0.126603,0.13997,0.144457,0.123371,0.109574,0.124493,0.348192,0.499862,0.497716,0.459917,0.283333,0.259885,0.283333,0.500379,0.459264
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.469675,0.464696,0.443734,0.931013,0.065965,0.290975,0.304891,0.371986,0.373578,0.337266,0.319287,0.079772,0.411871,0.385476,0.064891,0.0,0.067249,0.08094,0.070209,0.071093,0.065417,0.074399,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.554221,0.485963,0.473743,0.96915,0.124933,0.337215,0.397552,0.460018,0.461145,0.428971,0.415238,0.136376,0.487492,0.466174,0.104967,0.0,0.114345,0.14218,0.147219,0.132426,0.117061,0.140094,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0
75%,0.628799,0.501494,0.518114,0.98882,0.185548,0.397546,0.49476,0.558422,0.559451,0.527729,0.505651,0.212557,0.573047,0.555633,0.154507,0.034775,0.185485,0.225611,0.230205,0.193606,0.17322,0.210276,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Feature Selection

In [17]:
cols_test_y1 = ['C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'C3_Prime',	'C5_Prime', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5']

cols_test_y2 = ['C1','C2', 'C3', 'C4', 'C5', 'C6','C7',	'T1',	'T2',	'T3',	'T4',	'T5', 
            'S1',	'S2',	'S3',	'C3_Prime',	'C5_Prime',	'C6_Prime', 'S2_Prime', 
            'S3_Prime',	'T3_Prime',	'T4_Prime',	'T5_Prime',	'industry_1',	'industry_2',	'industry_3',	'industry_4',	'industry_5']

dataset_y1 = df[cols_test_y1].copy()
dataset_y2 = df[cols_test_y2].copy()
target_y1 = df['Y1']
target_y2 = df['Y2']

#s1Prime causes us to have an error for some reason

## Y1 Columns

In [18]:
dataset_y1 = dataset_y1.values
target_y1 = target_y1.values

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 6)
rfe = rfe.fit(dataset_y1, target_y1)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[False False  True False  True False  True False False False False False
  True  True False False False  True False False False False False False
 False False False]
[ 8 10  1  2  1  9  1  6 13 21 15  7  1  1 20 12  4  1 22 14 19  5 17 18
 11  3 16]


## Y2

In [19]:
dataset_y2 = dataset_y2 = dataset_y2.values
target_y2 = target_y2.values

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 6)
rfe = rfe.fit(dataset_y2, target_y2)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[False False  True False  True False False False False False False False
 False  True False False False  True False False False False False False
 False  True False  True]
[ 8  7  1 14  1 22  5 21 20 18 11 19  2  1 15 13  6  1  4  3 23 10 12 16
  9  1 17  1]


## Collumns we want to keep

In [20]:
cols_keep = ['C3', 'C5', 'C7', 'S1','S2', 'S2_Prime', 'C3','C5', 'S2','C6_Prime', 'industry_3', 'industry_5']
df_output = df[cols_keep].copy()
df_output['Y1']=df_y1.values
df_output['Y2']=df_y2.values
df_output.head()

Unnamed: 0,C3,C5,C7,S1,S2,S2_Prime,C3.1,C5.1,S2.1,C6_Prime,industry_3,industry_5,Y1,Y2
0,0.51039,0.981735,0.302839,0.100089,0.51813,0.10804,0.51039,0.981735,0.51813,0.042436,0,0,0,1
1,0.462999,0.893093,0.315374,0.164851,0.334307,0.186069,0.462999,0.893093,0.334307,0.0,0,0,1,0
2,0.448877,0.981494,0.319796,0.244401,0.575396,0.180035,0.448877,0.981494,0.575396,0.0,0,0,1,0
3,0.447968,0.940834,0.339174,0.365369,0.465152,0.233071,0.447968,0.940834,0.465152,0.0,0,0,1,1
4,0.496074,0.956793,0.388542,0.145683,0.431886,0.157879,0.496074,0.956793,0.431886,0.025445,1,0,0,1


In [21]:
df_output.to_csv('datasets2/RFE_L2_MM.csv', sep=',')