In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle


# Alzheimer's Data

In [2]:
# Location of the cleaned Alzheimer's dataset.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where this exact location exists.
alzheimers_csv_path = '/Users/Student/Library/Mobile Documents/com~apple~CloudDocs/SETU/Sem 2/Data Analytics and Algorithms/Assignment/Implementation/Data-Science-Implementation/Code/alzheimers_cleaned_data.csv'

# Read the CSV into a DataFrame and preview its first rows
alzheimers_data = pd.read_csv(alzheimers_csv_path)
alzheimers_data.head()

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,...,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,...,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,...,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,...,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,...,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No


In [3]:
# Partition the column labels by dtype: float64/int64 columns are treated as
# numerical, object/category columns as categorical.
alzheimers_numeric_dtypes = ['float64', 'int64']
alzheimers_categorical_dtypes = ['object', 'category']

numerical_columns = alzheimers_data.select_dtypes(
    include=alzheimers_numeric_dtypes
).columns
categorical_columns = alzheimers_data.select_dtypes(
    include=alzheimers_categorical_dtypes
).columns

In [4]:
# Standardize every numerical column with one StandardScaler instance and
# write the scaled values back into the same columns.
# NOTE(review): the scaler is fitted on all rows, before the
# train/test/validation split, so held-out rows contribute to the fitted
# statistics. Confirm this is acceptable.
scaler = StandardScaler()
alzheimers_numeric_values = alzheimers_data[numerical_columns]
alzheimers_data[numerical_columns] = scaler.fit_transform(alzheimers_numeric_values)

# Preview the standardized frame
alzheimers_data.head()

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Diagnosis
0,Spain,1.389398,Male,-1.474299,1.305314,Medium,Never,Occasionally,No,No,...,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,0.002719,Male,-0.432087,0.654689,Medium,Former,Never,No,No,...,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,1.081247,Female,1.652339,-0.814465,High,Current,Occasionally,No,Yes,...,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,-1.460997,Male,1.304935,0.927532,Low,Never,Regularly,Yes,No,...,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,-1.075808,Female,-1.126895,0.675677,High,Former,Never,Yes,No,...,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No


In [5]:
# Encode every categorical column as integer codes.
#
# One LabelEncoder is fitted per column and kept in `label_encoders`
# (column name -> fitted encoder). Refitting a single shared encoder in the
# loop would keep only the last column's label mapping, leaving no way to
# decode the other columns or to encode new data consistently.
#
# NOTE(review): LabelEncoder assigns codes in sorted label order (the previews
# show High=0, Low=1, Medium=2), not in the natural ordinal order.
label_encoders = {}

# Kept so that code referring to `label_encoder` by name still finds an
# encoder; it is deliberately not one of the per-column encoders, so refitting
# it elsewhere cannot overwrite a stored mapping.
label_encoder = LabelEncoder()

for column in categorical_columns:
    column_encoder = LabelEncoder()
    alzheimers_data[column] = column_encoder.fit_transform(alzheimers_data[column])
    label_encoders[column] = column_encoder

# Display the first few rows of the encoded data
alzheimers_data.head()

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Diagnosis
0,16,1.389398,1,-1.474299,1.305314,2,2,1,0,0,...,1,0,1,1,0,1,2,0,1,0
1,0,0.002719,1,-0.432087,0.654689,2,1,0,0,0,...,1,2,2,2,0,0,1,0,1,0
2,14,1.081247,0,1.652339,-0.814465,0,0,1,0,1,...,0,2,0,1,0,1,2,0,0,0
3,4,-1.460997,1,1.304935,0.927532,1,2,2,1,0,...,1,2,1,1,0,0,2,1,0,0
4,17,-1.075808,0,-1.126895,0.675677,0,1,0,1,0,...,2,0,0,0,0,1,2,0,0,0


In [6]:
# Persist the scaled and label-encoded frame as CSV in the working directory,
# leaving the DataFrame index out of the file
alzheimers_processed_csv = 'alzheimers_processed.csv'
alzheimers_data.to_csv(alzheimers_processed_csv, index=False)

In [7]:
# First cut: 70% of the rows go to training, the other 30% are held out
train_data, remaining_data = train_test_split(
    alzheimers_data,
    test_size=0.3,
    random_state=42,
)

# Second cut: divide the held-out rows evenly into test and validation sets
test_data, validation_data = train_test_split(
    remaining_data,
    test_size=0.5,
    random_state=42,
)

# Report how many rows ended up in each split
print(f"Training data size: {train_data.shape[0]}")
print(f"Test data size: {test_data.shape[0]}")
print(f"Validation data size: {validation_data.shape[0]}")

Training data size: 51998
Test data size: 11142
Validation data size: 11143


In [8]:
# Separate the 'Diagnosis' column (the target) from the remaining columns
# (the features) in each of the three splits.

# Training split
target = train_data['Diagnosis']
features = train_data.drop(columns=['Diagnosis'])

# Test split
test_target = test_data['Diagnosis']
test_features = test_data.drop(columns=['Diagnosis'])

# Validation split
validation_target = validation_data['Diagnosis']
validation_features = validation_data.drop(columns=['Diagnosis'])

In [9]:
# Pickle each Alzheimer's split to its own file in the working directory.
# The mapping is file name -> object; files are written in the order listed.
alzheimers_pickles = {
    'features.pkl': features,
    'target.pkl': target,
    'test_features.pkl': test_features,
    'test_target.pkl': test_target,
    'validation_features.pkl': validation_features,
    'validation_target.pkl': validation_target,
}

for pickle_name, split_object in alzheimers_pickles.items():
    with open(pickle_name, 'wb') as f:
        pickle.dump(split_object, f)

# Student Performance Data

In [10]:

# Location of the cleaned student-performance dataset.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where this exact location exists.
student_csv_path = '/Users/Student/Library/Mobile Documents/com~apple~CloudDocs/SETU/Sem 2/Data Analytics and Algorithms/Assignment/Implementation/Data-Science-Implementation/Code/student_performance_cleaned_data.csv'

# Read the CSV into a DataFrame and preview its first rows
student_performance_data = pd.read_csv(student_csv_path)
student_performance_data.head()


Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [11]:
# Partition the student column labels by dtype: float64/int64 columns are
# treated as numerical, object/category columns as categorical.
student_numeric_dtypes = ['float64', 'int64']
student_categorical_dtypes = ['object', 'category']

student_numerical_columns = student_performance_data.select_dtypes(
    include=student_numeric_dtypes
).columns
student_categorical_columns = student_performance_data.select_dtypes(
    include=student_categorical_dtypes
).columns

# Show which columns landed in each group
print("Numerical columns in student_performance_data:", student_numerical_columns)
print("Categorical columns in student_performance_data:", student_categorical_columns)

Numerical columns in student_performance_data: Index(['Hours Studied', 'Previous Scores', 'Sleep Hours',
       'Sample Question Papers Practiced', 'Performance Index'],
      dtype='object')
Categorical columns in student_performance_data: Index(['Extracurricular Activities'], dtype='object')


In [12]:
# Standardize the student numerical columns using the existing `scaler`
# object. fit_transform refits it, so whatever it was fitted on before is
# replaced by the student-data fit.
# NOTE(review): 'Performance Index' is one of the numerical columns and is
# later used as the prediction target, so the target is standardized as well.
# Confirm this is intended.
student_numeric_values = student_performance_data[student_numerical_columns]
student_performance_data[student_numerical_columns] = scaler.fit_transform(student_numeric_values)

# Preview the standardized frame
student_performance_data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,0.775188,1.704176,Yes,1.456205,-1.249754,1.862167
1,-0.383481,0.723913,No,-1.492294,-0.900982,0.508818
2,1.16141,-1.063626,Yes,0.276805,-0.900982,-0.53222
3,0.002742,-1.005963,Yes,-0.902594,-0.900982,-1.000687
4,0.775188,0.320275,No,0.866505,0.145333,0.56087


In [17]:
# Encode every categorical student column as integer codes.
#
# A fresh LabelEncoder is created per column instead of refitting an encoder
# object left over from an earlier cell, so this section no longer depends on
# that cell having run. Each fitted encoder is kept in
# `student_label_encoders` (column name -> encoder) so the codes can be decoded
# later with inverse_transform.
#
# `label_encoder` is rebound inside the loop, so after the cell it still refers
# to the encoder fitted on the last categorical column.
student_label_encoders = {}

for column in student_categorical_columns:
    label_encoder = LabelEncoder()
    student_performance_data[column] = label_encoder.fit_transform(student_performance_data[column])
    student_label_encoders[column] = label_encoder

# Display the first few rows of the encoded data
student_performance_data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,0.775188,1.704176,1,1.456205,-1.249754,1.862167
1,-0.383481,0.723913,0,-1.492294,-0.900982,0.508818
2,1.16141,-1.063626,1,0.276805,-0.900982,-0.53222
3,0.002742,-1.005963,1,-0.902594,-0.900982,-1.000687
4,0.775188,0.320275,0,0.866505,0.145333,0.56087


In [19]:
# First cut: 70% of the rows go to training, the other 30% are held out
train_data, remaining_data = train_test_split(
    student_performance_data,
    test_size=0.3,
    random_state=42,
)

# Second cut: divide the held-out rows evenly into test and validation sets
test_data, validation_data = train_test_split(
    remaining_data,
    test_size=0.5,
    random_state=42,
)

# Report how many rows ended up in each split
print(f"Training data size: {train_data.shape[0]}")
print(f"Test data size: {test_data.shape[0]}")
print(f"Validation data size: {validation_data.shape[0]}")

Training data size: 7000
Test data size: 1500
Validation data size: 1500


In [20]:
# Separate the 'Performance Index' column (the target) from the remaining
# columns (the features) in each of the three splits.
# NOTE: test_features, test_target, validation_features and validation_target
# are the same names used in the Alzheimer's section, so those earlier objects
# are rebound by this cell.

# Training split
train_target = train_data['Performance Index']
train_features = train_data.drop(columns=['Performance Index'])

# Test split
test_target = test_data['Performance Index']
test_features = test_data.drop(columns=['Performance Index'])

# Validation split
validation_target = validation_data['Performance Index']
validation_features = validation_data.drop(columns=['Performance Index'])

In [21]:
# Pickle each student-performance split to its own file in the working
# directory. The mapping is file name -> object; files are written in the
# order listed.
student_pickles = {
    'student_train_features.pkl': train_features,
    'student_train_target.pkl': train_target,
    'student_test_features.pkl': test_features,
    'student_test_target.pkl': test_target,
    'student_validation_features.pkl': validation_features,
    'student_validation_target.pkl': validation_target,
}

for pickle_name, split_object in student_pickles.items():
    with open(pickle_name, 'wb') as f:
        pickle.dump(split_object, f)