# __Machine Learning Project__

# __BY :__ 
#            _Belal Khaled_

# PART 1: Data Visualization


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the housing dataset and show its structure and summary statistics.
data1 = pd.read_csv("housingdata5.csv")

# DataFrame.info() prints its report directly and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
data1.info()
print('----------------------------------------------------------------------')
print(data1.describe())

In [None]:
# Draw a scatter-plot matrix of the dataset to inspect pairwise relationships.
# NOTE(review): '' is not one of the documented `diagonal` options
# ('hist' / 'kde'); presumably chosen to leave the diagonal panels empty - confirm.
pd.plotting.scatter_matrix(
    data1,
    diagonal='',
    figsize=(26, 24),
)
plt.show()

In [None]:
# Display the pairwise correlation matrix of the dataset's columns.
# NOTE(review): recent pandas versions raise here if the CSV holds non-numeric
# columns unless numeric_only=True is passed - confirm against the data.
data1.corr()

In [None]:
# Plot latitude against longitude, then print the author's reading of the plot.
data1.plot(
    x="longitude",
    y="latitude",
    kind="scatter",
)
print("Strong to Very Strong Negative Correlation")
print("Negative Linear Relationship")

In [None]:
# Plot total_rooms against households, then print the author's reading of the plot.
data1.plot(
    x="households",
    y="total_rooms",
    kind="scatter",
)
print("Strong to Very Strong Positive Correlation")
print("Positive Linear Relationship")

In [None]:
# Plot median_income against median_house_value, then print the author's reading.
data1.plot(
    x="median_house_value",
    y="median_income",
    kind="scatter",
)
print("Moderate Positive Correlation")

# PART 2: Regression


In [None]:
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Load the housing data again for the regression part.
data2 = pd.read_csv("housingdata5.csv")

# Candidate predictors and the regression target.
feature_columns = [
    'housing_median_age',
    'total_rooms',
    'population',
    'households',
    'median_income',
]
X = data2[feature_columns]
y = data2['median_house_value']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

# __Forward Selection Strategy__

In [None]:
# Greedy forward selection.
#
# Start from an empty feature set. In every round, fit one linear model per
# remaining candidate feature (current selection + that candidate), keep the
# candidate with the highest R-squared, and stop as soon as no candidate
# improves on the best score reached so far. Unlike a single pass over the
# columns in file order, the outcome does not depend on column order.
#
# Every fitted model is recorded in model_performance, keyed by its
# comma-separated feature list, with (R-squared, MSE) as the value.
#
# NOTE(review): candidates are scored on the held-out test split, as before,
# so the score of the chosen subset is optimistic; a separate validation
# split would avoid that.
selected_features = []
model_performance = {}

remaining_features = list(X_train.columns)
best_r2_so_far = float("-inf")

while remaining_features:
    round_best = None  # (r2, feature) of the best candidate in this round

    for feature in remaining_features:
        candidate = selected_features + [feature]

        # Train on the candidate subset and evaluate on the test set.
        model = linear_model.LinearRegression()
        model.fit(X_train[candidate], y_train)
        y_pred = model.predict(X_test[candidate])

        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        model_performance[", ".join(candidate)] = (r2, mse)

        if round_best is None or r2 > round_best[0]:
            round_best = (r2, feature)

    # Stop when the best candidate of this round brings no improvement.
    if round_best[0] <= best_r2_so_far:
        break

    best_r2_so_far, chosen_feature = round_best
    selected_features.append(chosen_feature)
    remaining_features.remove(chosen_feature)

# R-squared Score and Mean Squared Error for Each Model,
# Then Identifying the Best Model:

In [None]:
# Print the score of every evaluated model and pick the one with the highest
# R-squared.
#
# The running best starts at -infinity (and its MSE at +infinity) instead of
# fixed sentinels: R-squared can be zero or negative for a poor model, and a
# starting value of 0 would then leave "no model" as the reported best.
best_features = ""
best_r2 = float("-inf")
best_mse = float("inf")

# model_performance maps "feat1, feat2, ..." -> (R-squared, MSE).
for features, performance in model_performance.items():
    score, error = performance

    print("Selected Features:", features)
    print("R-squared Score:", score)
    print("Mean Squared Error:", error)
    print('-------------------------------------------------------------------------------------------')

    # Keep the model with the strictly highest R-squared seen so far.
    if score > best_r2:
        best_features, best_r2, best_mse = features, score, error


# Print the best model's information
print("Best Model is:")
print("Selected Features:", best_features)
print("R-squared Score:", best_r2)
print("Mean Squared Error:", best_mse)



In [None]:
# Display the raw dictionary: feature list -> (R-squared, MSE).
model_performance

# PART 3: Classification


## 1. SVM 

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import dataset:
dataset = pd.read_csv("online_shoppers_intention_Dataset 5.csv")

# Print dataset information.
# DataFrame.info() prints its report directly and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [None]:
# Print the dataset's shape, then display its first rows.
print(dataset.shape)
dataset.head()

In [None]:
# Remove the 'Month' column (object dtype), then confirm the new shape and
# preview the remaining columns.
dataset = dataset.drop(columns=['Month'])
print(dataset.shape)
dataset.head()

# Data Preprocessing:


### Handling Missing Data:


In [None]:
# Count the missing values in each column, then report the grand total.
nullcount = dataset.isna().sum()
print('Total number of null values in dataset:', nullcount.sum())

### Converting String Value To Int Type:


In [2]:
# Replace the values of each listed column with integer codes, using a fresh
# encoder per column.
for column in ('VisitorType', 'Weekend', 'Revenue'):
    dataset[column] = LabelEncoder().fit_transform(dataset[column])

dataset.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,12,364.75,0.025,0.025,0.0,0.0,Nov,2,6,3,2,2,0,0
1,7,56.3,0,0.0,15,315.4,0.0,0.011111,0.0,0.0,Oct,2,2,1,2,0,0,0
2,0,0.0,0,0.0,20,457.583333,0.022222,0.044444,0.0,0.0,May,2,2,9,4,2,1,0
3,0,0.0,0,0.0,11,689.991667,0.0,0.026667,0.0,0.0,Nov,3,2,3,6,2,0,0
4,2,196.6,0,0.0,31,1514.3,0.036364,0.066667,0.0,0.0,Aug,1,8,7,1,2,1,0


# Separating Features And Labels:

In [None]:
# Features are every column except the last; the label is the last column.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Print the feature matrix's shape, then display its first rows.
print(X.shape)
X.head()

In [None]:
# Display the first label values.
y.head()

# Data Standardization:

In [None]:
# Standardize every feature by removing its mean and scaling to unit variance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# 10-Fold Cross Validation:

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svc = SVC(kernel='linear')

# Chain a scaler with the classifier so the scaler is re-fitted on the
# training part of every fold. Scaling statistics computed once on the full
# dataset include each validation fold and leak into training.
cv_model = make_pipeline(StandardScaler(), svc)

# Per-fold accuracy of 10-fold cross-validation.
scores = cross_val_score(cv_model, X, y, cv=10)

# Out-of-fold prediction for every sample.
y_pred = cross_val_predict(cv_model, X, y, cv=10)

# Compute and print confusion matrix
cm = confusion_matrix(y, y_pred)

# scikit-learn lays out a binary confusion matrix as
#     [[TN, FP],
#      [FN, TP]]
# with label 1 as the positive class, so TP is cm[1,1] and TN is cm[0,0].
print('Confusion_Matrix\n\n', cm)
print('\nTrue Positives(TP) = ' , cm[1,1])
print('\nTrue Negatives(TN) = ' , cm[0,0])
print('\nFalse Positives(FP) = ' , cm[0,1])
print('\nFalse Negatives(FN) = ' , cm[1,0])
print('-------------------------------------------------------')
print(classification_report(y, y_pred))
print('-------------------------------------------------------')
print("Score:", scores)
print("Average Score:", scores.mean())
print('-------------------------------------------------------')

#  Splitting Dataset Into Training Set And Testing Set:

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

# Hold-Out Method:

In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train a linear-kernel SVM on the training split.
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

# Print results.
# scikit-learn lays out a binary confusion matrix as
#     [[TN, FP],
#      [FN, TP]]
# with label 1 as the positive class, so TP is cm[1,1] and TN is cm[0,0].
print('confusion_matrix\n\n', cm)
print('\nTrue Positives(TP) = ' , cm[1,1])
print('\nTrue Negatives(TN) = ' , cm[0,0])
print('\nFalse Positives(FP) = ' , cm[0,1])
print('\nFalse Negatives(FN) = ' , cm[1,0])
print('-------------------------------------------------------')
print(classification_report(y_test, y_pred))
print('-------------------------------------------------------')
print("accuracy=", accuracy_score(y_test, y_pred))

## 2. Decision Tree

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import dataset:
dataset = pd.read_csv("online_shoppers_intention_Dataset 5.csv")

# Print dataset information.
# DataFrame.info() prints its report directly and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
dataset.info()

In [None]:
# Print the dataset's shape, then display its first rows.
print(dataset.shape)
dataset.head()

In [None]:
# Remove the 'Month' column, then confirm the new shape and preview the
# remaining columns.
dataset = dataset.drop(columns=['Month'])
print(dataset.shape)
dataset.head()

# Data Preprocessing:

### Handling Missing Data:

In [None]:
# Count the missing values in each column, then report the grand total.
nullcount = dataset.isna().sum()
print('Total number of null values in dataset:', nullcount.sum())

### Converting String Value To Int Type:

In [None]:
# Replace the values of each listed column with integer codes, using a fresh
# encoder per column.
for column in ('VisitorType', 'Weekend', 'Revenue'):
    dataset[column] = LabelEncoder().fit_transform(dataset[column])

dataset.head()

# Separating Features And Labels:

In [None]:
# Assign values to the X and y variables: every column except the last is a
# feature, the last column is the label.
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

print(X.shape)
# A notebook cell only renders its final expression, so the feature preview
# is printed explicitly; as a bare mid-cell expression it was discarded.
print(X.head())

y.head()

# Data Standardization:

In [None]:
# Standardize every feature by removing its mean and scaling to unit variance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# 10-Fold Cross Validation:

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict

# Create a Decision Tree classifier.
# The seed is fixed because tree construction is randomized: without it the
# two cross-validation runs below fit different trees, so the fold scores
# and the confusion matrix would describe different models and change on
# every execution.
classifier = DecisionTreeClassifier(random_state=1)

# Perform 10-fold cross-validation
scores = cross_val_score(classifier, X, y, cv=10)

# Perform cross-validation predictions
y_pred = cross_val_predict(classifier, X, y, cv=10)

# Compute and print confusion matrix
cm = confusion_matrix(y, y_pred)

# scikit-learn lays out a binary confusion matrix as
#     [[TN, FP],
#      [FN, TP]]
# with label 1 as the positive class, so TP is cm[1,1] and TN is cm[0,0].
print('Confusion_Matrix\n\n', cm)
print('\nTrue Positives(TP) = ' , cm[1,1])
print('\nTrue Negatives(TN) = ' , cm[0,0])
print('\nFalse Positives(FP) = ' , cm[0,1])
print('\nFalse Negatives(FN) = ' , cm[1,0])
print('-------------------------------------------------------')
print(classification_report(y, y_pred))
print('-------------------------------------------------------')
print("Score:", scores)
print("Average Score:", scores.mean())
print('-------------------------------------------------------')

#  Splitting Dataset Into Training Set And Testing Set:

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

# Hold-Out Method:

In [None]:
# Standardize features by removing mean and scaling to unit variance.
# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Use the DT classifier to fit data.
# The seed is fixed so the fitted tree, and therefore the reported metrics,
# are the same on every run.
classifier = DecisionTreeClassifier(random_state=1)
classifier.fit(X_train, y_train)

# Predict y data with classifier:
y_predict = classifier.predict(X_test)

# Compute and print confusion matrix
cm = confusion_matrix(y_test, y_predict)

# scikit-learn lays out a binary confusion matrix as
#     [[TN, FP],
#      [FN, TP]]
# with label 1 as the positive class, so TP is cm[1,1] and TN is cm[0,0].
print('Confusion_Matrix\n\n', cm)
print('\nTrue Positives(TP) = ' , cm[1,1])
print('\nTrue Negatives(TN) = ' , cm[0,0])
print('\nFalse Positives(FP) = ' , cm[0,1])
print('\nFalse Negatives(FN) = ' , cm[1,0])
print('-------------------------------------------------------')
print(classification_report(y_test, y_predict))
print('-------------------------------------------------------')
print("accuracy=", accuracy_score(y_test, y_predict))

## 3. KNN

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import dataset:
dataset = pd.read_csv("online_shoppers_intention_Dataset 5.csv")

# Print dataset information.
# DataFrame.info() prints its report directly and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
dataset.info()

In [None]:
# Print the dataset's shape, then display its first rows.
print(dataset.shape)
dataset.head()

In [None]:
# Remove the 'Month' column, then confirm the new shape and preview the
# remaining columns.
dataset = dataset.drop(columns=['Month'])
print(dataset.shape)
dataset.head()

# Data Preprocessing:

### Handling Missing Data:

In [None]:
# Count the missing values in each column, then report the grand total.
nullcount = dataset.isna().sum()
print('Total number of null values in dataset:', nullcount.sum())

### Converting String Value To Int Type:


In [None]:
# Replace the values of each listed column with integer codes, using a fresh
# encoder per column.
for column in ('VisitorType', 'Weekend', 'Revenue'):
    dataset[column] = LabelEncoder().fit_transform(dataset[column])

dataset.head()

# Separating Features And Labels:

In [None]:
# Assign values to the X and y variables: every column except the last is a
# feature, the last column is the label.
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

print(X.shape)
# A notebook cell only renders its final expression, so the feature preview
# is printed explicitly; as a bare mid-cell expression it was discarded.
print(X.head())

y.head()

# Data Standardization:

In [None]:
# Standardize every feature by removing its mean and scaling to unit variance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# 10-Fold Cross Validation:


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Create K-nearest neighbors classifier
classifier = KNeighborsClassifier()

# Chain a scaler with the classifier so the scaler is re-fitted on the
# training part of every fold. Scaling statistics computed once on the full
# dataset include each validation fold and leak into training.
cv_model = make_pipeline(StandardScaler(), classifier)

# Perform 10-fold cross-validation
scores = cross_val_score(cv_model, X, y, cv=10)

# Perform cross-validation predictions
y_pred = cross_val_predict(cv_model, X, y, cv=10)

cm = confusion_matrix(y, y_pred)

# scikit-learn lays out a binary confusion matrix as
#     [[TN, FP],
#      [FN, TP]]
# with label 1 as the positive class, so TP is cm[1,1] and TN is cm[0,0].
print('Confusion_Matrix\n\n', cm)
print('\nTrue Positives(TP) = ' , cm[1,1])
print('\nTrue Negatives(TN) = ' , cm[0,0])
print('\nFalse Positives(FP) = ' , cm[0,1])
print('\nFalse Negatives(FN) = ' , cm[1,0])
print('-------------------------------------------------------')
print(classification_report(y, y_pred))
print('-------------------------------------------------------')
print("Score:", scores)
print("Average Score:", scores.mean())
print('-------------------------------------------------------')

#  Splitting Dataset Into Training Set And Testing Set:

In [None]:
# Split dataset into train and test subsets (80% / 20%).
# The import is repeated here so this section does not depend on an earlier
# section having been run. The seed is fixed so the hold-out results are
# reproducible and use the same split parameters as the SVM and
# decision-tree sections.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

# Hold-Out Method:

In [None]:
# Standardize features by removing mean and scaling to unit variance.
# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Use the KNN classifier (5 neighbours) to fit data:
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

# Predict y data with classifier:
y_predict = classifier.predict(X_test)

# Compute and print confusion matrix
cm = confusion_matrix(y_test, y_predict)

# scikit-learn lays out a binary confusion matrix as
#     [[TN, FP],
#      [FN, TP]]
# with label 1 as the positive class, so TP is cm[1,1] and TN is cm[0,0].
print('Confusion_Matrix\n\n', cm)
print('\nTrue Positives(TP) = ' , cm[1,1])
print('\nTrue Negatives(TN) = ' , cm[0,0])
print('\nFalse Positives(FP) = ' , cm[0,1])
print('\nFalse Negatives(FN) = ' , cm[1,0])
print('-------------------------------------------------------')
print(classification_report(y_test, y_predict))
print('-------------------------------------------------------')
print("accuracy=", accuracy_score(y_test, y_predict))