# **BITS F464 - Semester 1 - MACHINE LEARNING**
--------------------------------------------------------------------------------

**ASSIGNMENT 1 - LINEAR MODELS FOR REGRESSION AND CLASSIFICATION**
--------------------------------------------------------------------------------
***Team number: 13***

---
***Team Members: ANIRUDH BAGALKOTKER, KARTIK PANDEY, ADWAIT KULKARNI, JOY SINHA, PIYUSH JAJRA***

---
***IDs: 2021A7PS2682H, 2021A7PS2574H, 2021A7PS2995H, 2021A8PS1606H, 2021B4A72969H***


This assignment aims to identify the differences between three sets of Machine Learning models.

# **_1. Dataset Generation_**

You are given a sample Diabetes dataset. Using this, please develop your own dataset consisting of 500 records. You can use the given code to generate your own dataset. Submit the generated dataset as a .csv file along with your python notebook.

In [None]:
import os
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.lite import SingleTablePreset
import warnings
warnings.filterwarnings( "ignore" )

# Locate the working directory and load every CSV found in it; the sample
# diabetes dataset is looked up under the key "diabetes" (i.e. diabetes.csv)
folderName = os.getcwd()
datasets = load_csvs(folder_name=folderName)
real_data = datasets["diabetes"]

# Generating metadata for the sample dataset.
# os.path.join builds the path portably instead of concatenating with "/".
metadata = SingleTableMetadata()
metadata.detect_from_csv(filepath=os.path.join(folderName, "diabetes.csv"))

# Preview the real data and the detected metadata.
# A bare `real_data.head()` in the middle of a cell is evaluated and thrown
# away, so it is printed explicitly here.
print(real_data.head())
# NOTE(review): the value returned by visualize() is not captured or displayed
# here; presumably it only renders when it is the last expression of a
# notebook cell -- confirm and move/display it if the diagram is wanted.
metadata.visualize()
print("\n")
print(metadata.to_dict())

# Initializing a SingleTablePreset object with the metadata and fitting the
# synthesizer on the real data.
synthesizer = SingleTablePreset(metadata, name="FAST_ML")
synthesizer.fit(data=real_data)

# Generating 500 rows of synthetic data using the synthesizer, then saving the
# data as a csv and the fitted synthesizer as a pkl
rows = 500
synthetic_data = synthesizer.sample(num_rows=rows)
synthetic_data.to_csv("synthetic_diabetes.csv", index=False)
synthesizer.save("diabetes.pkl")
print("\nSynthetic data generated.\n")
print(synthetic_data.head())

# ***2. Preprocess and perform exploratory data analysis of the dataset obtained***

In [None]:
from sdv.evaluation.single_table import evaluate_quality
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing of the Synthetic Data

# Handle missing values (if any) by replacing them with the column mean
synthetic_data.fillna(synthetic_data.mean(), inplace=True)

print("\nSynthetic data Preprocessed.\n")

# Exploratory Data Analysis of the Synthetic Data
print("\nEDA for Synthetic data.\n")

# Display the class balance of Outcome and the per-class feature means
print(synthetic_data['Outcome'].value_counts())
print("\n")
print(synthetic_data.groupby('Outcome').mean())
print("\n")

# Display basic statistics
print(synthetic_data.describe())
print("\n")

# Check data types and missing values.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None".
synthetic_data.info()
print("\n")

# Calculate and visualize correlations between numeric columns using a
# cluster map, and the Glucose distribution per Outcome using a box plot

# Calculate correlations
correlation_matrix = synthetic_data.corr()

# Plot clustermap.
# clustermap is a figure-level function: it creates its own figure, so the
# size is passed through `figsize` (a preceding plt.figure() would only leave
# an empty extra figure behind) and the title is set on the whole figure.
sns.clustermap(
    correlation_matrix,
    cmap="RdBu",
    center=0,
    cbar=True,
    annot=True,
    figsize=(10, 6),
)
plt.suptitle("Correlation Clustermap")
plt.show()

# Box plot
plt.figure(figsize=(10, 10))
sns.boxplot(x="Outcome", y="Glucose", data=synthetic_data)
plt.xlabel("Outcome")
plt.ylabel("Glucose")
plt.title("Box Plot of Glucose by Outcome")
plt.show()

# Evaluating the quality of the synthetic data using sdv.
# The figure returned by get_visualization is shown explicitly; as a bare
# mid-cell expression it would be discarded.
quality_report = evaluate_quality(real_data, synthetic_data, metadata)
quality_report.get_visualization("Column Shapes").show()

# Save the Synthetic Data and the Synthesizing Model after preprocessing and evaluation
synthetic_data.to_csv("synthetic_diabetes.csv", index=False)
synthesizer.save("diabetes.pkl")

# Separating the features and target
target = synthetic_data["Outcome"]
features = synthetic_data.drop(columns="Outcome")

# Splitting the data into training (80%) and test (20%)
total_samples = len(features)
train_samples = int(0.8 * total_samples)

# Shuffle the indices to randomize the data; the generator is seeded so the
# split (and therefore every reported accuracy) is reproducible between runs
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
indices = rng.permutation(total_samples)

# Split the indices into training and test sets
train_indices = indices[:train_samples]
test_indices = indices[train_samples:]

# Create training and test data
x_tr = features.iloc[train_indices]
y_tr = target.iloc[train_indices]
x_te = features.iloc[test_indices]
y_te = target.iloc[test_indices]

# Standardizing the data (zero mean, unit variance per feature).
# The statistics come from the training split only and are then applied to
# both splits, so no information from the test set leaks into training.
# (Min-max normalization would be the alternative:
#  (x - train_min) / (train_max - train_min).)
train_mean = x_tr.mean()
# A constant column has std 0; dividing by 1 instead leaves it at 0 rather
# than turning it into NaN.
train_std = x_tr.std().replace(0, 1)
x_tr = (x_tr - train_mean) / train_std
x_te = (x_te - train_mean) / train_std

print("\nx_tr:\n",x_tr.head())
print("\ny_tr:\n",y_tr.head())
print("\nx_te:\n",x_te.head())
print("\ny_te:\n",y_te.head())

# Accuracy Score function
def accuracy_score(y_pred, y_true):
    """
    Return the fraction of predictions that equal the true labels.

    Both inputs are converted to flat numpy arrays before comparison, so
    lists, numpy arrays of shape (n,) or (n, 1), and pandas Series (whatever
    their index) are compared position by position.

    :param y_pred: Predicted labels
    :param y_true: Actual labels, in the same order as y_pred
    :return: Number of matching positions divided by the number of labels;
             0.0 when there are no labels at all
    :raises ValueError: If y_pred and y_true hold a different number of labels
    """
    # Flatten first: comparing an (n, 1) array with an (n,) array would
    # otherwise broadcast to an (n, n) matrix and inflate the count.
    y_pred = np.asarray(y_pred).ravel()
    y_true = np.asarray(y_true).ravel()

    if y_pred.shape != y_true.shape:
        raise ValueError(
            "y_pred and y_true must contain the same number of labels, "
            f"got {y_pred.size} and {y_true.size}"
        )

    num_total = y_true.size
    if num_total == 0:
        # Nothing to score; avoid a division by zero.
        return 0.0

    num_correct = np.sum(y_true == y_pred)

    return num_correct / num_total


# ***3. Comparison of Stochastic Gradient Descent and Batch Gradient Descent using Linear Regression***

## **_Stochastic Gradient Descent_**

## **_Batch Gradient Descent_**

## **_Insights drawn (plots, markdown explanations)_**

In [None]:
#plot a graph using any python lib (matplotlib, plotly etc..)

<!-- Explain your model Implementation using mathematical formulas and algorithms -->

# **_4. Comparison of Lasso and Ridge Regression using Polynomial Regression_**

## **_Lasso Regression_**

## **_Ridge Regression_**

## **_Insights drawn (plots, markdown explanations)_**

In [None]:
#plot a graph using any python lib (matplotlib, plotly etc..)

<!-- Explain your model Implementation using mathematical formulas and algorithms -->

# **_5. Comparison of Logistic Regression and Least Squares Classification_**

## **_Logistic Regression_**

In [None]:
class Logistic_Regression():
    """
    Binary logistic-regression classifier trained with batch gradient descent.

    After model() has run, the learned parameters are available as
    self.w (weights, shape (n,)) and self.b (bias).
    """

    # defining the constructor with learning rate and no of iterations (Hyperparameters)
    def __init__(self, learning_rate, no_of_iterations):
        """
        Store the hyperparameters used by gradient descent.

        :param learning_rate: Step size applied to each gradient update
        :param no_of_iterations: Number of gradient-descent updates run by model()
        """
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations

    @staticmethod
    def _sigmoid(z):
        """Return 1 / (1 + exp(-z)), clipping z so np.exp cannot overflow."""
        # Beyond +/-500 the sigmoid is 0 or 1 to within float precision, so
        # clipping does not change the result but avoids overflow warnings.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    # model function to train the model with dataset
    def model(self, X, Y):
        """
        Train the classifier on X and Y.

        The inputs are converted to plain numpy arrays: X to a float matrix
        and Y to a flat float vector. This keeps the arithmetic positional
        (pandas objects would otherwise be aligned by index label) and lets Y
        be given either as shape (m,) or (m, 1).

        :param X: Training features, shape (m, n)
        :param Y: Training labels (0 or 1), m values
        :return: Nothing; the result is stored in self.w and self.b
        :raises ValueError: If X is not 2-D, is empty, or X and Y disagree on m
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (m, n)")
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if X.shape[0] != Y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but Y has {Y.shape[0]} labels"
            )

        # number of data points(rows) = m and no of features(columns) = n
        self.m, self.n = X.shape

        # initializing the weights and bias to zero
        self.w = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y

        # implementing gradient descent for optimization
        for _ in range(self.no_of_iterations):
            self.update_weights_and_bias()

    # function for updating the weights and bias using gradient descent
    def update_weights_and_bias(self):
        """
        Perform one gradient-descent step on self.w and self.b.

        Uses self.X and self.Y as stored by model():
            Y_hat = sigmoid(X.w + b)
            dw    = (1/m) * X^T (Y_hat - Y)
            db    = (1/m) * sum(Y_hat - Y)
            w    := w - learning_rate * dw
            b    := b - learning_rate * db

        :return: Nothing; self.w and self.b are updated in place
        """
        # predicted probabilities for the current parameters
        Y_hat = self._sigmoid(self.X.dot(self.w) + self.b)
        residual = Y_hat - self.Y

        # derivatives
        dw = (1 / self.m) * np.dot(self.X.T, residual)
        db = (1 / self.m) * np.sum(residual)

        # updating the weights and bias using the gradient descent formula
        self.w = self.w - self.learning_rate * dw
        self.b = self.b - self.learning_rate * db

    # predict function to predict the output using Sigmoid Equation and Decision Boundary
    def predict(self, X):
        """
        Predict a 0/1 label for every row of X.

        A row is labelled 1 when sigmoid(X.w + b) > 0.5 and 0 otherwise.

        :param X: Features to classify, shape (k, n)
        :return: numpy array of k predicted labels (0 or 1)
        """
        X = np.asarray(X, dtype=float)
        Y_pred = self._sigmoid(X.dot(self.w) + self.b)
        return np.where(Y_pred > 0.5, 1, 0)

# Fit the logistic-regression classifier on the training split
classifier = Logistic_Regression(no_of_iterations=1000, learning_rate=0.01)
classifier.model(x_tr, y_tr)

# Model Evaluation

# Predict labels for both splits with the fitted classifier
x_train_predict = classifier.predict(x_tr)
x_test_predict = classifier.predict(x_te)

# Score each set of predictions against its true labels
train_data_accuracy = accuracy_score(x_train_predict, y_tr)
test_data_accuracy = accuracy_score(x_test_predict, y_te)

# Report training accuracy first, then test accuracy
print("\nThe Accuracy Score of Training Data: ", train_data_accuracy)

print("\nThe Accuracy Score of Test Data: ", test_data_accuracy)

## **_Least Squares Classification_**

In [None]:
class LMSClassifier:
    """
    Least-squares classifier: fits a linear model to the 0/1 targets in closed
    form and labels a sample 1 when its fitted value exceeds the mean fitted
    value of the batch being predicted.
    """

    # defining the constructor with learning rate and no of iterations (Hyperparameters)
    def __init__(self, learning_rate, no_of_iterations):
        """
        Store the hyperparameters.

        Both values are only stored; the closed-form fit in model() does not
        read them. They are kept so the constructor matches
        Logistic_Regression.

        :param learning_rate: Stored as self.learning_rate
        :param no_of_iterations: Stored as self.no_of_iterations
        """
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations

    @staticmethod
    def _add_bias_column(X):
        """Return X as a float matrix with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (m, n)")
        return np.column_stack((np.ones(X.shape[0]), X))

    # model function to train the model with dataset
    def model(self, X, Y):
        """
        Fit the coefficients by solving the normal equations.

        A column of ones is prepended to X (for the intercept) and the
        coefficients are computed as
            beta = pinv(X1^T X1) (X1^T Y)
        for any number of training samples. X and Y may be numpy arrays or
        pandas objects.

        :param X: Training features, shape (m, n)
        :param Y: Training targets (0 or 1), m values
        :return: Nothing; the result is stored in self.beta with shape
                 (n + 1, 1), intercept first
        :raises ValueError: If X is not 2-D or X and Y disagree on m
        """
        X1 = self._add_bias_column(X)
        Y = np.asarray(Y, dtype=float).reshape(-1, 1)

        if X1.shape[0] != Y.shape[0]:
            raise ValueError(
                f"X has {X1.shape[0]} samples but Y has {Y.shape[0]} targets"
            )

        # Compute the coefficients (beta) for linear regression using the pseudo-inverse method
        self.beta = np.linalg.pinv(X1.T @ X1) @ (X1.T @ Y)

    # predict function to predict the output using the coefficients (beta)
    def predict(self, X_test):
        """
        Predict a 0/1 label for every row of X_test.

        A column of ones is prepended to X_test, the fitted values
        Y_hat = X @ beta are computed, and each sample is labelled 1.0 when its
        fitted value is strictly greater than the mean of Y_hat, else 0.0.

        NOTE(review): the threshold is the mean of the batch passed in, so a
        sample's label depends on the other samples in X_test, and a
        single-row batch is always labelled 0. A fixed threshold (e.g. 0.5)
        may be what is wanted -- confirm before changing.

        :param X_test: Features to classify, shape (k, n)
        :return: Float numpy array of k predictions (0.0 or 1.0)
        """
        X = self._add_bias_column(X_test)
        Y_hat = (X @ self.beta).reshape(-1)

        # predicting the output by checking Y_hat > mean for 1 and Y_hat <= mean for 0
        mean = np.mean(Y_hat)
        return np.where(Y_hat > mean, 1.0, 0.0)

# Training the model
# (the hyperparameters are stored by the constructor; the least-squares fit
# itself is computed in closed form by model())
classifier = LMSClassifier(learning_rate=0.01, no_of_iterations=1000)
classifier.model(x_tr, y_tr)

# Show the fitted coefficients; a bare `classifier.beta` in the middle of a
# cell is evaluated and discarded, so it is printed explicitly
print("\nLeast Squares coefficients (beta):\n", classifier.beta)

# Model Evaluation for Training Data
x_train_predict = classifier.predict(x_tr)
train_data_accuracy = accuracy_score(x_train_predict, y_tr)

print("\nThe Accuracy Score of Training Data: ", train_data_accuracy)

# Model Evaluation for Test Data
x_test_predict = classifier.predict(x_te)
test_data_accuracy = accuracy_score(x_test_predict, y_te)

# This is the test-set score, so it is labelled "Test Data"
print("\nThe Accuracy Score of Test Data: ", test_data_accuracy)

## **_Insights drawn (plots, markdown explanations)_**

In [None]:
#plot a graph using any python lib (matplotlib, plotly etc..)

<!-- Explain your model Implementation using mathematical formulas and algorithms -->

# **_6. References_**

1.   SDV: https://docs.sdv.dev/sdv/
2.   Preprocessing: https://towardsdatascience.com/data-preprocessing-and-eda-for-data-science-50ba6ea65c0a
3.   Preprocessing for Missing Data using Pandas: https://pandas.pydata.org/docs/user_guide/missing_data.html
4.   EDA using Seaborn: https://www.analyticsvidhya.com/blog/2021/08/how-to-perform-exploratory-data-analysis-a-guide-for-beginners/
5.   Plotting Graphs using Seaborn: https://seaborn.pydata.org/


