In [53]:
import pandas as pd
import numpy as np
import random
import csv

import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

In [54]:
######### Create random Dataset to test and fit ML model ##########
Rows_number = 200
Article_ID = np.arange(1,(Rows_number+1),1)
Article_value = np.random.randint(201, size=Rows_number)
Article_trend = np.random.choice(["Software","Mobile","UI_UX","Machine_Learning","Web","FullStack"], Rows_number)

column_name = ['Article_ID']
  
# creating the dataframe
df = pd.DataFrame(data = Article_ID,  columns = column_name)
df['Article_value'] = Article_value
df['Article_trend'] = Article_trend
# displaying the dataframe
df.head(10) 


Unnamed: 0,Article_ID,Article_value,Article_trend
0,1,15,Machine_Learning
1,2,25,FullStack
2,3,97,UI_UX
3,4,87,FullStack
4,5,150,Web
5,6,29,Mobile
6,7,144,Software
7,8,58,Machine_Learning
8,9,48,Software
9,10,117,UI_UX


In [55]:
df.tail(5)

Unnamed: 0,Article_ID,Article_value,Article_trend
195,196,78,Software
196,197,186,FullStack
197,198,77,Web
198,199,140,Mobile
199,200,25,Mobile


In [56]:
print(len(df))

200


In [57]:
### Generate Class Column and add it to dataset

################   function to get class list values   #########
def get_class_values(df):
  class_list = []
  for i in range(Rows_number) :
    if (((df['Article_value'][i]) >= 80) and ((df['Article_value'][i]) <= 140)):
      class_list.append(3)
    elif (((df['Article_value'][i]) < 80) and ((df['Article_trend'][i]) in ["Software","Mobile"])) :
      class_list.append(2)
    elif (((df['Article_value'][i]) < 80) and ((df['Article_trend'][i]) in ["UI_UX","Web","FullStack"])) :
      class_list.append(1)
    elif (((df['Article_value'][i]) > 140) and ((df['Article_trend'][i]) in ["Software","Machine_Learning"])) : 
       class_list.append(5)
    elif (((df['Article_value'][i]) > 140) and ((df['Article_trend'][i]) in ["Web","FullStack"])) :
       class_list.append(4)
  return class_list

######################################################################################

Class_of_Article = get_class_values(df)
print(len(Class_of_Article))
## add Class Column to dataset
df['Class'] = Class_of_Article
df.head(10)





170


ValueError: ignored

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
#### check non missed values NAN
df.isnull().sum()

In [None]:
# to know count of all columns according to Class column
df['Class'].value_counts()

In [None]:
df.groupby(['Class']).mean()

In [None]:
# Split the data into features and target label
features = df.drop(['Class','Student_ID'], axis = 1)
target = df['Class']
print("The Features\n",features)
print("The Target\n",target)

# Data Preprocessing
### 1- Data Normalized for Sensor_value Column is continuous max 200 
*  Using a logarithmic transformation then
*  MinMaxScaler to normalized data from 0 to 1

In [None]:
skewed = ['Sensor_value']
features_log_transformed = pd.DataFrame(data = features)
features_log_transformed[skewed] = features[skewed].apply(lambda x: np.log(x + 1))### +1 because log0 not define


In [None]:
# Import sklearn.preprocessing.StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler() # default=(0, 1)
numerical = ['Sensor_value']

features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

features_log_minmax_transform


### 2- Using one-hot encoding for Sensor_trend Column


|   | someFeature |                    | someFeature_A | someFeature_B | someFeature_C |
| :-: | :-: |                            | :-: | :-: | :-: |
| 0 |  B  |  | 0 | 1 | 0 |
| 1 |  C  | ----> one-hot encode ----> | 0 | 0 | 1 |
| 2 |  A  |  | 1 | 0 | 0 |


In [None]:
# one-hot encoding
features_final = pd.get_dummies(features_log_minmax_transform)  

# Print the number of features after one-hot encoding
encoded = list(features_final.columns)

print("{} total features after one-hot encoding.".format(len(encoded)))

features_final.head()


# Final Data Features after processing

In [None]:
# Split the data into features and target label
X = features_final
y = target
print("The Features\n",X)
print("The Target\n",y)

In [None]:
### convert random dataset to csv 
### save it to my desktop or cloud
df.to_csv (r'C:\Users\Gamal\Desktop\data_csv.csv', index = False, header=True)



In [None]:
# Import train_test_split
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

print(y_train.shape[0], y_test.shape[0])

In [None]:
clf = DecisionTreeClassifier(random_state=25, max_depth=5)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
acc = accuracy_score(y_test, y_predict)

print(f'The Accuracy of ML Algorithm = {acc*100} %')

In [None]:
input_data = (0.94,0,0,0,0,0,1,0)
### changing the input data to a Numpy array
input_data_as_numpy_array = np.asarray(input_data)

### reshape the np array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction_values = clf.predict(input_data_reshaped)
print(prediction_values)
print(type(prediction_values))

In [59]:
########  Plot  ################################
pca = PCA(2)
X_projected = pca.fit_transform(X)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1] 

fig = plt.figure()  
plt.scatter(x1, x2, c=y, alpha=0.8, cmap="viridis")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar()

## Show plot
pyplot(fig)


NameError: ignored