# **BITS F464 - Semester 1 - MACHINE LEARNING**
--------------------------------------------------------------------------------

**ASSIGNMENT 1 - LINEAR MODELS FOR REGRESSION AND CLASSIFICATION**
--------------------------------------------------------------------------------
***Team number: 13***

---
***Team Members: ANIRUDH BAGALKOTKER, KARTIK PANDEY, ADWAIT KULKARNI, JOY SINHA, PIYUSH JAJRA***

---
***IDs: 2021A7PS2682H, 2021A7PS2574H, 2021A7PS2995H, 2021A8PS1606H, 2021B4A72969H***


This assignment aims to identify the differences between three sets of Machine Learning models.

# **_1. Dataset Generation_**

You are given a sample Diabetes dataset. Using this, please develop your own dataset consisting of 500 records. You can use the given code to generate your own dataset. Submit the generated dataset as a .csv file along with your python notebook.

In [None]:
import os
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.lite import SingleTablePreset
import warnings
warnings.filterwarnings( "ignore" )

# Resolve the folder that holds the sample diabetes dataset.
# `__file__` only exists when this code runs as a script; inside a notebook
# kernel it is undefined, so fall back to the current working directory.
try:
    folderName = os.path.abspath(os.path.dirname(__file__))
except NameError:
    folderName = os.getcwd()

# Load the csv files found in that folder and pick the sample table,
# which is looked up under the key "diabetes".
datasets = load_csvs(folder_name=folderName)
real_data = datasets["diabetes"]

# Generating metadata for the sample dataset
metadata = SingleTableMetadata()
metadata.detect_from_csv(filepath=os.path.join(folderName, "diabetes.csv"))

# Visualizing the metadata and printing it
# (head() is printed explicitly: a bare expression in the middle of a cell shows nothing)
print(real_data.head())
# NOTE(review): the value returned by visualize() is discarded here; presumably
# it was meant to be displayed - confirm and display it explicitly if so.
metadata.visualize()
print("\n")
print(metadata.to_dict())

# Initializing a SingleTablePreset object with the metadata and fitting the synthesizer on the real_data input.
synthesizer = SingleTablePreset(metadata, name="FAST_ML")
synthesizer.fit(data=real_data)

# Generating 500 rows of synthetic data using the synthesizer and saving it as a csv and the synthesizer as a pkl
rows = 500
synthetic_data = synthesizer.sample(num_rows=rows)
synthetic_data.to_csv("synthetic_diabetes.csv", index=False)
synthesizer.save("diabetes.pkl")
print("\nSynthetic data generated.\n")
print(synthetic_data.head())

# ***2. Preprocess and perform exploratory data analysis of the dataset obtained***

In [None]:
from sdv.evaluation.single_table import evaluate_quality
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the synthetic dataset
synthetic_data = pd.read_csv("synthetic_diabetes.csv")

# Preprocessing of the Synthetic Data

# Handle missing values (if any) by replacing them with the column mean.
# numeric_only=True keeps the mean computation working even if a
# non-numeric column is present in the csv.
synthetic_data.fillna(synthetic_data.mean(numeric_only=True), inplace=True)

# Min-max normalization of the data (currently disabled)
# for column in synthetic_data:
#     synthetic_data[column] = (synthetic_data[column] - synthetic_data[column].min()) / (
#         synthetic_data[column].max() - synthetic_data[column].min()
#     )

print("\nSynthetic data Preprocessed.\n")

# Exploratory Data Analysis of the Synthetic Data
print("\nEDA for Synthetic data.\n")

# Display basic statistics
print(synthetic_data.describe())
print("\n")

# Check data types and missing values.
# info() writes its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
synthetic_data.info()
print("\n")

# Calculate and visualize correlations between numeric columns using a heat map and a cluster map.
# The correlation matrix is computed once and shared by both plots.
correlations = synthetic_data.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlations, cmap="RdBu", center=0, cbar=True, annot=True)
sns.clustermap(correlations, cmap="RdBu", center=0, cbar=True, annot=True)
plt.show()

# Box plot of Glucose grouped by Outcome
plt.figure(figsize=(10, 10))
sns.boxplot(x="Outcome", y="Glucose", data=synthetic_data)
plt.xlabel("Outcome")
plt.ylabel("Glucose")
plt.show()

# Evaluating the quality of the synthetic data using sdv.
# real_data, metadata and synthesizer are expected to exist already from
# the dataset-generation step.
quality_report = evaluate_quality(real_data, synthetic_data, metadata)
quality_report.get_visualization("Column Shapes")

# Save the Synthetic Data and the Synthesizing Model after preprocessing and evaluation
synthetic_data.to_csv("synthetic_diabetes.csv", index=False)
synthesizer.save("diabetes.pkl")

# ***3. Comparison of Stochastic Gradient Descent and Batch Gradient Descent using Linear Regression***

## **_Stochastic Gradient Descent_**

## **_Batch Gradient Descent_**

## **_Insights drawn (plots, markdown explanations)_**

In [None]:
#plot a graph using any python lib (matplotlib, plotly etc..)

<!-- Explain your model Implementation using mathematical formulas and algorithms -->

# **_4. Comparison of Lasso and Ridge Regression using Polynomial Regression_**

## **_Lasso Regression_**

## **_Ridge Regression_**

## **_Insights drawn (plots, markdown explanations)_**

In [None]:
#plot a graph using any python lib (matplotlib, plotly etc..)

<!-- Explain your model Implementation using mathematical formulas and algorithms -->

# **_5. Comparison of Logistic Regression and Least Squares Classification_**

## **_Logistic Regression_**

In [None]:
import numpy as np

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), evaluated element-wise.

    The value is computed from exp(-|x|), which never overflows, so inputs
    with a large magnitude do not trigger floating-point overflow warnings.

    Args:
        x: A scalar or array-like of real numbers.

    Returns:
        A NumPy float for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x); for x < 0 the equivalent e^x / (1 + e^x).
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as they are.
    return result[()]

def square_loss(y, t):
    """Return the mean of the squared element-wise differences between ``y`` and ``t``."""
    residual = y - t
    return np.mean(residual ** 2)

# Driver code
def main():
    """Load the synthetic dataset and split it into two disjoint parts.

    Reads "synthetic_diabetes.csv" from the working directory, draws a random
    sample holding 75% of the rows as the "test" part and keeps the remaining
    rows as the "train" part. In each part the last column is the target and
    every other column is a feature.

    Returns:
        tuple: ``(X_tr, Y_tr, X_te, Y_te)`` as NumPy arrays; the targets keep
        a trailing dimension of size 1.
    """
    df = pd.read_csv("synthetic_diabetes.csv")

    # DataFrame.sample only accepts an integer row count, so use integer
    # arithmetic on the number of rows actually loaded.
    # NOTE(review): 75% of the rows go to the sampled "test" part and 25% to
    # "train"; a 75/25 train/test split is more common - confirm intent.
    testRows = len(df) * 75 // 100
    test = df.sample(testRows)

    # The training part is every row not drawn into the sample; rows that
    # still contain missing values are dropped from it.
    train = df.drop(index=test.index).dropna()

    X_te, Y_te = test.iloc[:, :-1].values, test.iloc[:, -1:].values
    X_tr, Y_tr = train.iloc[:, :-1].values, train.iloc[:, -1:].values
    return X_tr, Y_tr, X_te, Y_te


if __name__ == "__main__":
    main()


## **_Least Squares Classification_**

## **_Insights drawn (plots, markdown explanations)_**

In [None]:
#plot a graph using any python lib (matplotlib, plotly etc..)

<!-- Explain your model Implementation using mathematical formulas and algorithms -->

# **_6. References_**

1.   SDV: https://docs.sdv.dev/sdv/
2.   Preprocessing: https://towardsdatascience.com/data-preprocessing-and-eda-for-data-science-50ba6ea65c0a
3.   Preprocessing for Missing Data using Pandas: https://pandas.pydata.org/docs/user_guide/missing_data.html
4.   EDA using Seaborn: https://www.analyticsvidhya.com/blog/2021/08/how-to-perform-exploratory-data-analysis-a-guide-for-beginners/
5.   Plotting Graphs using Seaborn: https://seaborn.pydata.org/


