In [0]:
import numpy as np
import matplotlib as mplt
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import cluster
from sklearn import tree
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from google.colab import files
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

# uploaded=files.upload()
print("Reading in training set...")
training=pd.read_csv('training.txt', sep='\s+', header=None)
training.columns=['info_id', 'feature_id', 'value']
print(training)

Reading in training set...
        info_id  feature_id  value
0             1           6   1.00
1             1         160  31.00
2             1         438   1.00
3             1         479   1.00
4             1         618   1.00
...         ...         ...    ...
443398     1842       25905  65.00
443399     1842       25930  34.00
443400     1842       26083   0.78
443401     1842       26181   0.52
443402     1842       26184   1.00

[443403 rows x 3 columns]


In [0]:
print("\nReading in label values...")
label_train=pd.read_csv('label_training.txt', header=None)
label_train.columns=['label']
print(label_train)


Reading in label values...
      label
0         1
1         1
2         1
3         1
4         1
...     ...
1837     -1
1838     -1
1839      1
1840     -1
1841      1

[1842 rows x 1 columns]


In [0]:
print("Organizing training dataframe to be readable...\n")
matrix=pd.pivot_table(training, index="info_id", columns="feature_id", values="value")
matrix=matrix.fillna(0)
print(matrix)

Organizing training dataframe to be readable...

feature_id  2      3      4      5      ...  26357  26360  26362  26364
info_id                                 ...                            
1             0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0
2             0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0
3             0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0
4             0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0
5             0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0
...           ...    ...    ...    ...  ...    ...    ...    ...    ...
1838          0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0
1839          0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0
1840          0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0
1841          0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0
1842          0.0    0.0    0.0    0.0  ...    0.0    0.0    0.0    0.0

[1842 rows x 1

In [0]:
print("Adjusting the number of dimensions, fitting to model...\n")
pca=PCA(n_components=150)
model_fit=pca.fit_transform(matrix)
formatted_matrix=pd.DataFrame(data=model_fit)
print(formatted_matrix)

Adjusting the number of dimensions, fitting to model...

            0          1          2    ...       147       148       149
0    -34.286615 -12.721954  66.547243  ... -1.086992 -0.268492 -0.848150
1     23.901116 -22.975198 -68.809828  ... -0.786058 -0.752929 -0.086216
2     16.071592 -32.588397  37.704559  ... -0.883725 -0.267223 -0.474541
3      7.736907 -25.975390  37.202887  ... -0.985070  0.152938  0.144487
4      3.076045  -5.450483  -4.501322  ... -0.025268  0.059458 -0.643230
...         ...        ...        ...  ...       ...       ...       ...
1837  -9.597636 -20.443191  73.627654  ...  0.447043  0.469068 -0.362460
1838  41.171066  67.265584 -75.581830  ... -0.459863 -0.726671  0.308059
1839   1.354640 -17.356014  91.747598  ... -0.072619 -0.024446  0.333474
1840  21.551570  -0.757411  54.222044  ...  0.195649  0.949035 -0.579851
1841  55.425196  20.429191  25.267938  ...  0.905120 -0.331334  0.107831

[1842 rows x 150 columns]


In [0]:
print("Merging formatted matrix & label training data frame to label each item and its attributes...\n")
merged_matrix=pd.merge(matrix, label_train, left_index=True, right_index=True)
print(merged_matrix)

Merging formatted matrix & label training data frame to label each item and its attributes...

        2    3    4    5    6    8  ...  26356  26357  26360  26362  26364  label
1     0.0  0.0  0.0  0.0  1.0  0.0  ...    0.0    0.0    0.0    0.0    0.0      1
2     0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0      1
3     0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0      1
4     0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0      1
5     0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0      1
...   ...  ...  ...  ...  ...  ...  ...    ...    ...    ...    ...    ...    ...
1837  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0     -1
1838  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0     -1
1839  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0      1
1840  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0    0.0    0.0    0.0    0.0     -1
184

In [58]:
print("Setting up our axes...")
training_set, testing_set=train_test_split(merged_matrix, test_size=.2)
x_train=training_set.drop(['label'], axis=1)
y_train=training_set['label']
x_test=testing_set.drop(['label'], axis=1)
y_test=testing_set['label']

Setting up our axes...


In [59]:
print("Using a Naive Bayes Classifier...")
gnb=GaussianNB()
prediction=gnb.fit(x_train,y_train).predict(x_test)
print("Number of mislabeled points: " + str(x_test.shape[0]))

Using a Naive Bayes Classifier...
Number of mislabeled points: 369


In [60]:
dec_tree=tree.DecisionTreeClassifier(criterion='entropy')
dec_tree=dec_tree.fit(x_train,y_train)
print(dec_tree)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [64]:
y_predict=dec_tree.predict(x_test)
prediction = pd.concat([pd.Series(y_test.values,name='Real Class'),pd.Series(y_predict,name='Expected Class')], axis=1)
print(prediction)

     Real Class  Expected Class
0            -1              -1
1            -1               1
2             1               1
3             1               1
4             1               1
..          ...             ...
364           1               1
365           1               1
366           1              -1
367           1               1
368          -1              -1

[369 rows x 2 columns]


In [0]:
training_items=training['info_id'].max()
training_features=training['feature_id'].max()

print("The number of data items is: " + str(training_items))
print("The maximum amount of features is: " + str(training_features))
print("\nSplitting up info_ids into separate objects...")

index=0
info_index=1

print("\n")
while info_index == training.info_id[index]:
  print(str(training.info_id[index]) + " " + str(training.feature_id[index]) + " " + str(training.value[index]))
  # df_add=pd.DataFr
  if(training.info_id[index+1] != info_index):
    #  info_index+=1
    print("Done!")
  index+=1

print(str(training.info_id[index]) + " " + str(training.feature_id[index]) + " " + str(training.value[index]))