In [91]:
%reset -f
# import required libraries
import numpy as np 
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import csv

In [92]:
# Load the training set. `df` is the data frame holding every training sample
# (features and target column together); the test set is read later, at
# prediction time.
df = pd.read_csv('training.csv')


# Report the size of the data as (number of samples, number of columns).
df_size = df.shape
print(df_size)

(734, 49)


In [93]:
# Pre-processing data

# Count the missing values column by column and add them up; a total of 0
# means the data set contains no NaN entries.
NaN_sum = sum(df[column].isnull().values.sum() for column in df)
print(NaN_sum)

0


In [94]:
# Separate the target column from the features: X holds every column except
# 'target', y holds the target value of each sample.
# NOTE(review): the original comment spoke of 47 features, but df was printed
# with 49 columns, which leaves 48 columns in X -- confirm whether one more
# column (e.g. an identifier) should be dropped as well.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally is deprecated and rejected by pandas 2.x.
X = df.drop(columns='target')
y = df['target']
y

0      0
1      0
2      0
3      0
4      0
      ..
729    1
730    1
731    1
732    1
733    1
Name: target, Length: 734, dtype: int64

In [95]:
# As a rule of thumb: number of samples = (number of features)**2
# We would need at least 47**2 = 2209 samples to handle 47 features
# (feature count as stated by the author -- TODO confirm against X.shape[1]).
# Since we will use a 5-fold cross-validation method, 80% of the samples (i.e. 587)
# make up the training set of each fold.
# With that many samples we can keep at most 24 features.
# Therefore, the number of features has to be reduced.

# Derive the numbers from the data instead of hard-coding them, so the cell
# stays correct if the training file changes.
TRAIN_FRACTION = 0.8  # share of the samples used for training in each of the 5 folds
n_train_samples = int(TRAIN_FRACTION * df.shape[0])
int(np.sqrt(n_train_samples))

24

In [96]:
# Feature selection

# First strategy: look for redundant features using the Pearson correlation
# coefficient of every pair of columns. The cell prints the largest absolute
# correlation found between two different columns.

# Absolute pairwise correlations between all feature columns.
corr_mat = X.corr(method='pearson').abs()

# Blank out the diagonal (each column correlates perfectly with itself) so
# that only off-diagonal entries take part in the maximum.
diagonal = np.eye(corr_mat.shape[1], dtype=bool)
corr_mat_offdiag = corr_mat.mask(diagonal)

# Largest remaining value: per-column maxima first, then the maximum of those.
column_maxima = corr_mat_offdiag.max(axis='rows')
print(round(max(column_maxima), 4))


0.7723


In [97]:
# Feature selection...

# The maximum correlation between columns is not in the range (0.95, 1.0), so
# no feature can be removed for being highly correlated with another one.
# Instead, Principal Component Analysis (PCA) is used to reduce the number of
# dimensions.

# Before PCA the data is standardized so that features on larger scales do not
# dominate the components. The raw features are rebuilt from `df` rather than
# read from `X`, because this cell overwrites `X`; that keeps the cell
# idempotent (re-running it does not scale/project already-projected data).
sc = StandardScaler()
X_scaled = sc.fit_transform(df.drop(columns='target'))

# Project onto 24 principal components. random_state is fixed because PCA may
# select a randomized SVD solver automatically, which would otherwise make the
# projection differ between runs.
pca = PCA(n_components=24, random_state=1)
X = pca.fit_transform(X_scaled)

In [98]:
# Cross-validation: to choose a suitable classifier, several candidates
# (SVM, Logistic Regression, Random Forest) are evaluated with the same
# 5-fold split and compared on mean accuracy.
# NOTE(review): X was standardized and PCA-projected using all rows before the
# folds are split, so the validation folds influenced the preprocessing;
# consider wrapping scaler + PCA + classifier in a Pipeline to avoid this.

# Set the number of folds (splits) of the cross-validation to 5
cv = KFold(n_splits=5, random_state=1, shuffle=True)

# Candidate classifiers. The random forest is seeded like the SVM so that its
# reported accuracy is reproducible from run to run.
SVM_classifier = SVC(kernel='rbf', random_state = 1)
model = LogisticRegression()
model_RF = RandomForestClassifier(random_state=1)

candidates = [
    ('SVM', SVM_classifier),
    ('Logistic Regression', model),
    ('Random Forest', model_RF),
]

# Evaluate every candidate the same way and report mean (std) accuracy.
cv_scores = {}
for label, estimator in candidates:
    fold_scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    cv_scores[label] = fold_scores
    print('Accuracy of %s: %.3f (%.3f)' % (label, np.mean(fold_scores), np.std(fold_scores)))

# Keep the per-model score arrays available under their original names.
scores_SVM = cv_scores['SVM']
scores_LgReg = cv_scores['Logistic Regression']
scores_RF = cv_scores['Random Forest']




Accuracy of SVM: 0.742 (0.020)
Accuracy of Logistic Regression: 0.636 (0.038)
Accuracy of Random Forest: 0.713 (0.028)


In [99]:
# SVM reached the highest cross-validated accuracy, so it is kept as the
# binary classifier and trained on the complete training set.
SVM_classifier.fit(X, y)

# Load the unlabeled test samples and pass them through the already fitted
# scaler and PCA projection, in that order.
Test_df = pd.read_csv('test.csv')
X_test_non_labeled = pca.transform(sc.transform(Test_df))

# Predict the targets, then show how many predictions were made and their values.
y_pred_delivered = SVM_classifier.predict(X_test_non_labeled)
print(y_pred_delivered.shape)
print(np.asarray(y_pred_delivered))

(200,)
[0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0
 0 1 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 1 1 0
 1 1 0 0 1 0 1 0 0 1 1 1 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 1 0 1 0 1 1
 1 0 1 1 1 0 0 0 0 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1
 0 1 1 1 0 0 0 1 1 0 0 1 0 0 1]


In [100]:
# Save the predictions as a one-column CSV file, without a header row and
# without the index column.
predictions_frame = pd.DataFrame(y_pred_delivered)
predictions_frame.to_csv('prediction.csv', index=False, header=False)

In [101]:
# Read the saved predictions back; the file was written without a header row.
y_read = pd.read_csv('prediction.csv', header=None)

# Check the round trip explicitly instead of only eyeballing the table: the
# single column read from disk must equal the predictions held in memory.
assert np.array_equal(y_read[0].to_numpy(), y_pred_delivered), \
    'prediction.csv does not match y_pred_delivered'

y_read

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,1
...,...
195,0
196,1
197,0
198,0
