#BARKAVI A/P P CHEVEN (23093149)
## WQD7012 Tutorial 3 Applied Machine Learning by Dr Riyaz

Part 1: Data Cleaning

In [1]:
 import pandas as pd
 import numpy as np

In [2]:
 # Sample dataset with missing values and duplicates
 data1 = pd.DataFrame({
 'ID': [1, 2, 3, 4, 4],
 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'David'],
 'Age': [25, 30, np.nan, 40, 40],
 'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Houston']
 })
 print("Original Data:")
 print(data1)

Original Data:
   ID     Name   Age         City
0   1    Alice  25.0     New York
1   2      Bob  30.0  Los Angeles
2   3  Charlie   NaN      Chicago
3   4    David  40.0      Houston
4   4    David  40.0      Houston


In [6]:
# Remove duplicate rows
data1 = data1.drop_duplicates()
print(data1)

   ID     Name   Age         City
0   1    Alice  25.0     New York
1   2      Bob  30.0  Los Angeles
2   3  Charlie   NaN      Chicago
3   4    David  40.0      Houston


In [7]:
 # Fill missing values in 'Age' with the column mean
 data1['Age'].fillna(data1['Age'].mean(), inplace=True)
 print("\nCleaned Data:")
 print(data1)


Cleaned Data:
   ID     Name        Age         City
0   1    Alice  25.000000     New York
1   2      Bob  30.000000  Los Angeles
2   3  Charlie  31.666667      Chicago
3   4    David  40.000000      Houston


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data1['Age'].fillna(data1['Age'].mean(), inplace=True)


In [10]:
# Fill missing values in 'Age' with the column mean
data1.fillna({'Age': data1['Age'].mean()}, inplace=True)
print("\nCleaned Data:")
print(data1)


Cleaned Data:
   ID     Name        Age         City
0   1    Alice  25.000000     New York
1   2      Bob  30.000000  Los Angeles
2   3  Charlie  31.666667      Chicago
3   4    David  40.000000      Houston


In [17]:
# Example
import pandas as pd
import numpy as np
datala = pd.DataFrame({
 'ID': [1, 2, 3, 4, 4],
 'Name': ['Cheven', 'Ranjitham', 'Barkavi', 'Sharmini', 'Sharmini'],
 'Age': [56, 56, np.nan, 22, 22],
 'City': ['Canada', 'Canada', 'France', 'America', 'America']
 })

print("Original Data:")
print(datala)

Original Data:
   ID       Name   Age     City
0   1     Cheven  56.0   Canada
1   2  Ranjitham  56.0   Canada
2   3    Barkavi   NaN   France
3   4   Sharmini  22.0  America
4   4   Sharmini  22.0  America


In [18]:
# To remove duplicate rows
datala = datala.drop_duplicates()
print(datala)


   ID       Name   Age     City
0   1     Cheven  56.0   Canada
1   2  Ranjitham  56.0   Canada
2   3    Barkavi   NaN   France
3   4   Sharmini  22.0  America


In [19]:
# Fill missing values in 'Age' with the column mean
datala.fillna({'Age': datala['Age'].mean()}, inplace=True)
print("\nCleaned Data:")
print(datala)


Cleaned Data:
   ID       Name        Age     City
0   1     Cheven  56.000000   Canada
1   2  Ranjitham  56.000000   Canada
2   3    Barkavi  44.666667   France
3   4   Sharmini  22.000000  America


Part 2: Data Integration

In [41]:
# Sample datasets
data1 = pd.DataFrame({
 'ID': [1, 2, 3, 4, 4],
 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'David'],
 'Age': [25, 30, np.nan, 40, 40],
 'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Houston']
 })
data1 = data1.drop_duplicates()
data1.fillna({'Age': data1['Age'].mean()}, inplace=True)

data2 = pd.DataFrame({
 'ID': [1, 2, 3, 5],
 'Income': [50000, 60000, 55000, 45000],
 'Gender': ['F', 'M', 'M', 'F']
 })

print('Datasets:')
print(data1)
print('-------------------------------------')
print('')
print(data2)

Datasets:
   ID     Name        Age         City
0   1    Alice  25.000000     New York
1   2      Bob  30.000000  Los Angeles
2   3  Charlie  31.666667      Chicago
3   4    David  40.000000      Houston
-------------------------------------

   ID  Income Gender
0   1   50000      F
1   2   60000      M
2   3   55000      M
3   5   45000      F


In [42]:
 # Merge datasets on 'ID'
 merged_data = pd.merge(data1, data2, on='ID', how='left')
 print("Merged Data:")
 print(merged_data)

Merged Data:
   ID     Name        Age         City   Income Gender
0   1    Alice  25.000000     New York  50000.0      F
1   2      Bob  30.000000  Los Angeles  60000.0      M
2   3  Charlie  31.666667      Chicago  55000.0      M
3   4    David  40.000000      Houston      NaN    NaN


Part 3: Data Transformation

In [43]:
# Encode categorical variable 'Gender' using one-hot encoding
merged_data = pd.get_dummies(merged_data, columns=['Gender'], drop_first=True)

In [46]:
import pandas as pd
# Normalize 'Income' column using Min-Max scaling
merged_data['Income_Normalized'] = (
    (merged_data['Income'] - merged_data['Income'].min()) /
    (merged_data['Income'].max() - merged_data['Income'].min())
)

print("Transformed Data:")
print(merged_data)

Transformed Data:
   ID     Name        Age         City   Income  Gender_M  Income_Normalized
0   1    Alice  25.000000     New York  50000.0     False                0.0
1   2      Bob  30.000000  Los Angeles  60000.0      True                1.0
2   3  Charlie  31.666667      Chicago  55000.0      True                0.5
3   4    David  40.000000      Houston      NaN     False                NaN


Part 4: Data Reduction

In [47]:
 # Drop columns that are not useful for analysis
 reduced_data = merged_data.drop(columns=['City', 'Income'])

In [48]:
 print("Reduced Final Data:")
 print(reduced_data)

Reduced Final Data:
   ID     Name        Age  Gender_M  Income_Normalized
0   1    Alice  25.000000     False                0.0
1   2      Bob  30.000000      True                1.0
2   3  Charlie  31.666667      True                0.5
3   4    David  40.000000     False                NaN


Part 5:  Upload the Excel File to Google Colab

In [50]:
 from google.colab import files
 uploaded = files.upload()

Saving student_data.xlsx to student_data.xlsx


In [51]:
 import pandas as pd
 # Load the Excel file into a DataFrame
 df = pd.read_excel("student_data.xlsx")

 # Display the data
 print(df)

     StudentID      Name   Age Gender Grade                             Email  \
0         1000     Jerry   NaN  Other     C                   zhill@gmail.com   
1         1001     Erica   NaN    NaN     D                 yayala@walker.com   
2         1002    Ashley   NaN      F     C                 brian89@gmail.com   
3         1003   Michael   NaN  Other     A          hickmanrebecca@yahoo.com   
4         1004    Eileen  20.0  Other     B  michaelbaker@fernandez-davis.net   
..         ...       ...   ...    ...   ...                               ...   
100       1022  Danielle  18.0      F   NaN          delacruzbarry@montes.net   
101       1080      Jack   NaN    NaN   NaN          ywatkins@kirk-peters.com   
102       1053    Amanda  24.0      F     D             juliebailey@yahoo.com   
103       1061    Justin   NaN      F     F               bryanking@quinn.com   
104       1072   Gregory   NaN      F     F     kimberlymcconnell@hotmail.com   

            City Enrollment

In [52]:
 df.head()

Unnamed: 0,StudentID,Name,Age,Gender,Grade,Email,City,EnrollmentDate
0,1000,Jerry,,Other,C,zhill@gmail.com,Los Angeles,NaT
1,1001,Erica,,,D,yayala@walker.com,Los Angeles,2024-03-15
2,1002,Ashley,,F,C,brian89@gmail.com,Los Angeles,2025-03-20
3,1003,Michael,,Other,A,hickmanrebecca@yahoo.com,Phoenix,2024-01-28
4,1004,Eileen,20.0,Other,B,michaelbaker@fernandez-davis.net,New York,2024-07-12


Part 6: Data Cleaning

In [54]:
 # Make a copy of the dataset for cleaning
 cleaned_df = df.copy()
 print(cleaned_df)

     StudentID      Name   Age Gender Grade                             Email  \
0         1000     Jerry   NaN  Other     C                   zhill@gmail.com   
1         1001     Erica   NaN    NaN     D                 yayala@walker.com   
2         1002    Ashley   NaN      F     C                 brian89@gmail.com   
3         1003   Michael   NaN  Other     A          hickmanrebecca@yahoo.com   
4         1004    Eileen  20.0  Other     B  michaelbaker@fernandez-davis.net   
..         ...       ...   ...    ...   ...                               ...   
100       1022  Danielle  18.0      F   NaN          delacruzbarry@montes.net   
101       1080      Jack   NaN    NaN   NaN          ywatkins@kirk-peters.com   
102       1053    Amanda  24.0      F     D             juliebailey@yahoo.com   
103       1061    Justin   NaN      F     F               bryanking@quinn.com   
104       1072   Gregory   NaN      F     F     kimberlymcconnell@hotmail.com   

            City Enrollment

In [56]:
# Remove duplicate rows
cleaned_df = cleaned_df.drop_duplicates()
print(cleaned_df)

    StudentID     Name   Age Gender Grade                             Email  \
0        1000    Jerry   NaN  Other     C                   zhill@gmail.com   
1        1001    Erica   NaN    NaN     D                 yayala@walker.com   
2        1002   Ashley   NaN      F     C                 brian89@gmail.com   
3        1003  Michael   NaN  Other     A          hickmanrebecca@yahoo.com   
4        1004   Eileen  20.0  Other     B  michaelbaker@fernandez-davis.net   
..        ...      ...   ...    ...   ...                               ...   
95       1095  Barbara  19.0  Other     D          johnsonlouis@hotmail.com   
96       1096   Amanda  20.0  Other     A         robinsonkaren@johnson.com   
97       1097   Joanne  18.0    NaN     D              smithlinda@gmail.com   
98       1098    Ricky   NaN      F     F              dawsonchad@gmail.com   
99       1099   Thomas   NaN  Other     A        kennethwarren@bartlett.net   

           City EnrollmentDate  
0   Los Angeles   

In [59]:
 # Fill missing values
 cleaned_df['Age'] = cleaned_df['Age'].fillna(cleaned_df['Age'].mean())
 cleaned_df['Gender'] = cleaned_df['Gender'].fillna(cleaned_df['Gender'].mode()[0])
 cleaned_df['Grade'] = cleaned_df['Grade'].fillna(cleaned_df['Grade'].mode()[0])
 cleaned_df['City'] = cleaned_df['City'].fillna(cleaned_df['City'].mode()[0])
 cleaned_df['EnrollmentDate'] = cleaned_df['EnrollmentDate'].fillna(pd.Timestamp('2023-01-01'))

 print(cleaned_df)

    StudentID     Name        Age Gender Grade  \
0        1000    Jerry  21.860465  Other     C   
1        1001    Erica  21.860465      F     D   
2        1002   Ashley  21.860465      F     C   
3        1003  Michael  21.860465  Other     A   
4        1004   Eileen  20.000000  Other     B   
..        ...      ...        ...    ...   ...   
95       1095  Barbara  19.000000  Other     D   
96       1096   Amanda  20.000000  Other     A   
97       1097   Joanne  18.000000      F     D   
98       1098    Ricky  21.860465      F     F   
99       1099   Thomas  21.860465  Other     A   

                               Email         City EnrollmentDate  
0                    zhill@gmail.com  Los Angeles     2023-01-01  
1                  yayala@walker.com  Los Angeles     2024-03-15  
2                  brian89@gmail.com  Los Angeles     2025-03-20  
3           hickmanrebecca@yahoo.com      Phoenix     2024-01-28  
4   michaelbaker@fernandez-davis.net     New York     2024-07-12

In [60]:
 # View cleaned data
 cleaned_df.head()

Unnamed: 0,StudentID,Name,Age,Gender,Grade,Email,City,EnrollmentDate
0,1000,Jerry,21.860465,Other,C,zhill@gmail.com,Los Angeles,2023-01-01
1,1001,Erica,21.860465,F,D,yayala@walker.com,Los Angeles,2024-03-15
2,1002,Ashley,21.860465,F,C,brian89@gmail.com,Los Angeles,2025-03-20
3,1003,Michael,21.860465,Other,A,hickmanrebecca@yahoo.com,Phoenix,2024-01-28
4,1004,Eileen,20.0,Other,B,michaelbaker@fernandez-davis.net,New York,2024-07-12



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



Part 7: Data Integration

In [64]:
# Create simulated additional data for merging
import random
additional_data = pd.DataFrame({
    'StudentID': cleaned_df['StudentID'].sample(frac=0.8).values,
    'Club': np.random.choice(['Science', 'Art', 'Sports', 'None'], size=int(0.8 * len(cleaned_df)))
})

In [65]:
print(additional_data)

    StudentID     Club
0        1024  Science
1        1005      Art
2        1019   Sports
3        1037      Art
4        1069      Art
..        ...      ...
75       1087   Sports
76       1007     None
77       1052  Science
78       1022      Art
79       1083   Sports

[80 rows x 2 columns]


In [66]:
# Merge on StudentID
merged_df = pd.merge(cleaned_df, additional_data, on='StudentID', how='left')

In [67]:
 # View merged data
 merged_df.head()

Unnamed: 0,StudentID,Name,Age,Gender,Grade,Email,City,EnrollmentDate,Club
0,1000,Jerry,21.860465,Other,C,zhill@gmail.com,Los Angeles,2023-01-01,
1,1001,Erica,21.860465,F,D,yayala@walker.com,Los Angeles,2024-03-15,Art
2,1002,Ashley,21.860465,F,C,brian89@gmail.com,Los Angeles,2025-03-20,
3,1003,Michael,21.860465,Other,A,hickmanrebecca@yahoo.com,Phoenix,2024-01-28,Art
4,1004,Eileen,20.0,Other,B,michaelbaker@fernandez-davis.net,New York,2024-07-12,Sports


Part 8: Data Transformation

In [69]:
# Make a copy for transformation
transformed_df = merged_df.copy()

In [70]:
print(transformed_df)

    StudentID     Name        Age Gender Grade  \
0        1000    Jerry  21.860465  Other     C   
1        1001    Erica  21.860465      F     D   
2        1002   Ashley  21.860465      F     C   
3        1003  Michael  21.860465  Other     A   
4        1004   Eileen  20.000000  Other     B   
..        ...      ...        ...    ...   ...   
95       1095  Barbara  19.000000  Other     D   
96       1096   Amanda  20.000000  Other     A   
97       1097   Joanne  18.000000      F     D   
98       1098    Ricky  21.860465      F     F   
99       1099   Thomas  21.860465  Other     A   

                               Email         City EnrollmentDate    Club  
0                    zhill@gmail.com  Los Angeles     2023-01-01     NaN  
1                  yayala@walker.com  Los Angeles     2024-03-15     Art  
2                  brian89@gmail.com  Los Angeles     2025-03-20    None  
3           hickmanrebecca@yahoo.com      Phoenix     2024-01-28     Art  
4   michaelbaker@fernand

In [71]:
 # One-hot encode categorical columns
 transformed_df = pd.get_dummies(transformed_df, columns=['Gender', 'Grade', 'Club'], drop_first=True)

In [72]:
print(transformed_df)

    StudentID     Name        Age                             Email  \
0        1000    Jerry  21.860465                   zhill@gmail.com   
1        1001    Erica  21.860465                 yayala@walker.com   
2        1002   Ashley  21.860465                 brian89@gmail.com   
3        1003  Michael  21.860465          hickmanrebecca@yahoo.com   
4        1004   Eileen  20.000000  michaelbaker@fernandez-davis.net   
..        ...      ...        ...                               ...   
95       1095  Barbara  19.000000          johnsonlouis@hotmail.com   
96       1096   Amanda  20.000000         robinsonkaren@johnson.com   
97       1097   Joanne  18.000000              smithlinda@gmail.com   
98       1098    Ricky  21.860465              dawsonchad@gmail.com   
99       1099   Thomas  21.860465        kennethwarren@bartlett.net   

           City EnrollmentDate  Gender_M  Gender_Other  Grade_B  Grade_C  \
0   Los Angeles     2023-01-01     False          True    False     Tru

In [73]:
 # Normalize Age (Min-Max Scaling)
 transformed_df['Age_Normalized'] = (
    (transformed_df['Age'] - transformed_df['Age'].min()) /
    (transformed_df['Age'].max() - transformed_df['Age'].min())
 )

 print(transformed_df)

    StudentID     Name        Age                             Email  \
0        1000    Jerry  21.860465                   zhill@gmail.com   
1        1001    Erica  21.860465                 yayala@walker.com   
2        1002   Ashley  21.860465                 brian89@gmail.com   
3        1003  Michael  21.860465          hickmanrebecca@yahoo.com   
4        1004   Eileen  20.000000  michaelbaker@fernandez-davis.net   
..        ...      ...        ...                               ...   
95       1095  Barbara  19.000000          johnsonlouis@hotmail.com   
96       1096   Amanda  20.000000         robinsonkaren@johnson.com   
97       1097   Joanne  18.000000              smithlinda@gmail.com   
98       1098    Ricky  21.860465              dawsonchad@gmail.com   
99       1099   Thomas  21.860465        kennethwarren@bartlett.net   

           City EnrollmentDate  Gender_M  Gender_Other  Grade_B  Grade_C  \
0   Los Angeles     2023-01-01     False          True    False     Tru

In [74]:
 # Convert EnrollmentDate to datetime
 transformed_df['EnrollmentDate'] = pd.to_datetime(transformed_df['EnrollmentDate'])

In [75]:
 # View transformed data
 transformed_df.head()

Unnamed: 0,StudentID,Name,Age,Email,City,EnrollmentDate,Gender_M,Gender_Other,Grade_B,Grade_C,Grade_D,Grade_F,Club_None,Club_Science,Club_Sports,Age_Normalized
0,1000,Jerry,21.860465,zhill@gmail.com,Los Angeles,2023-01-01,False,True,False,True,False,False,False,False,False,0.551495
1,1001,Erica,21.860465,yayala@walker.com,Los Angeles,2024-03-15,False,False,False,False,True,False,False,False,False,0.551495
2,1002,Ashley,21.860465,brian89@gmail.com,Los Angeles,2025-03-20,False,False,False,True,False,False,True,False,False,0.551495
3,1003,Michael,21.860465,hickmanrebecca@yahoo.com,Phoenix,2024-01-28,False,True,False,False,False,False,False,False,False,0.551495
4,1004,Eileen,20.0,michaelbaker@fernandez-davis.net,New York,2024-07-12,False,True,True,False,False,False,False,False,True,0.285714


Part 9: Data Reduction

In [76]:
# Drop irrelevant or less useful columns
reduced_df = transformed_df.drop(columns=['Email', 'City', 'EnrollmentDate'])

In [77]:
print(reduced_df)

    StudentID     Name        Age  Gender_M  Gender_Other  Grade_B  Grade_C  \
0        1000    Jerry  21.860465     False          True    False     True   
1        1001    Erica  21.860465     False         False    False    False   
2        1002   Ashley  21.860465     False         False    False     True   
3        1003  Michael  21.860465     False          True    False    False   
4        1004   Eileen  20.000000     False          True     True    False   
..        ...      ...        ...       ...           ...      ...      ...   
95       1095  Barbara  19.000000     False          True    False    False   
96       1096   Amanda  20.000000     False          True    False    False   
97       1097   Joanne  18.000000     False         False    False    False   
98       1098    Ricky  21.860465     False         False    False    False   
99       1099   Thomas  21.860465     False          True    False    False   

    Grade_D  Grade_F  Club_None  Club_Science  Club

In [78]:
 # Final preprocessed dataset
reduced_df.head()

Unnamed: 0,StudentID,Name,Age,Gender_M,Gender_Other,Grade_B,Grade_C,Grade_D,Grade_F,Club_None,Club_Science,Club_Sports,Age_Normalized
0,1000,Jerry,21.860465,False,True,False,True,False,False,False,False,False,0.551495
1,1001,Erica,21.860465,False,False,False,False,True,False,False,False,False,0.551495
2,1002,Ashley,21.860465,False,False,False,True,False,False,True,False,False,0.551495
3,1003,Michael,21.860465,False,True,False,False,False,False,False,False,False,0.551495
4,1004,Eileen,20.0,False,True,True,False,False,False,False,False,True,0.285714
