# Machine Learning

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Creating a DataFrame with various date formats

Data = {
    "Date": ['2021-12-01', '01-12-2022', '2022/12/01', '12-01-2021', '2023-03-15'],
    "Country": ['Pak', 'PK', 'Pakistani', 'PAK', 'PAKISTANI'],
    "Name": ['Aassad', 'Alyy', 'Andna', 'Zainab', 'Kainat'],
    "Age": [25, 30, 22, 28, 35],
    "Score": [88.5, 92.0, 79.5, 85.0, 90.5]
}
df = pd.DataFrame(Data)
print(df)

         Date    Country    Name  Age  Score
0  2021-12-01        Pak  Aassad   25   88.5
1  01-12-2022         PK    Alyy   30   92.0
2  2022/12/01  Pakistani   Andna   22   79.5
3  12-01-2021        PAK  Zainab   28   85.0
4  2023-03-15  PAKISTANI  Kainat   35   90.5


In [3]:
# Convert All types of date formats to a standard format (YYYY-MM-DD)
def parse_date(x):
    try:
        # Try ISO format first
        return pd.to_datetime(x, format='%Y-%m-%d')
    except Exception:
        # Try ambiguous formats
        for day in (True, False):
            dt = pd.to_datetime(x, errors='coerce', dayfirst=day)
            if not pd.isna(dt):
                return dt
    return pd.NaT

df['Date'] = df['Date'].apply(parse_date)
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')
print(df)

         Date    Country    Name  Age  Score
0  2021-12-01        Pak  Aassad   25   88.5
1  2022-12-01         PK    Alyy   30   92.0
2  2022-01-12  Pakistani   Andna   22   79.5
3  2021-01-12        PAK  Zainab   28   85.0
4  2023-03-15  PAKISTANI  Kainat   35   90.5


In [4]:
# Harmonizing Country names
Country_Mapping =  {'Pak': 'Pakistan', 'PK': 'Pakistan', 'Pakistani': 'Pakistan', 'PAK': 'Pakistan', 'PAKISTANI': 'Pakistan'}
df["Country"] = df["Country"].replace(Country_Mapping)
print(df)


         Date   Country    Name  Age  Score
0  2021-12-01  Pakistan  Aassad   25   88.5
1  2022-12-01  Pakistan    Alyy   30   92.0
2  2022-01-12  Pakistan   Andna   22   79.5
3  2021-01-12  Pakistan  Zainab   28   85.0
4  2023-03-15  Pakistan  Kainat   35   90.5


In [5]:
# Correcting the Typographical errors in the Name column
Name_Mapping = {'Aassad':'Asad', 'Alyy':'Ali', 'Andna':'Adnan', 'Zainab':'Ali', 'Kainat':'Adnan'}
df["Name"] = df["Name"].replace(Name_Mapping)
print(df)

         Date   Country   Name  Age  Score
0  2021-12-01  Pakistan   Asad   25   88.5
1  2022-12-01  Pakistan    Ali   30   92.0
2  2022-01-12  Pakistan  Adnan   22   79.5
3  2021-01-12  Pakistan    Ali   28   85.0
4  2023-03-15  Pakistan  Adnan   35   90.5


In [6]:
# Removing duplicates
df = df.drop_duplicates(subset="Name")
print(df)
# When you run this command you got duplicate free data basedn on the Name column

         Date   Country   Name  Age  Score
0  2021-12-01  Pakistan   Asad   25   88.5
1  2022-12-01  Pakistan    Ali   30   92.0
2  2022-01-12  Pakistan  Adnan   22   79.5


In [9]:
# Displaying the final DataFrame
print(df)

         Date   Country  Name  Age  Score
0  2021-12-01  Pakistan  Asad   25   88.5
1  2022-12-01  Pakistan   Ali   30   92.0


In [8]:
# Dropping rows which score is less than 89 and higher than or equal to 92

df = df[(df['Score'] > 85) & (df['Score'] <= 92)]
print(df)

         Date   Country  Name  Age  Score
0  2021-12-01  Pakistan  Asad   25   88.5
1  2022-12-01  Pakistan   Ali   30   92.0
