**Loading Data in Machine Learning**

*Manual Data Load*

In [None]:
# manual data load
import pandas as pd

# Build a small table by hand: one list per column, every list the same length.
data = dict(
    Name=['Maryam', 'Amna', 'Fatima'],
    Age=[24, 25, 26],
    Hobby=['teasing', 'crying', 'working'],
    salary=[1000, 2000, 3000],
)

manual_data = pd.DataFrame(data)

# One print call: sep='\n' supplies the line break between the banner and the table.
print('\n---------MANUAL DATA LOAD----------\n', manual_data, sep='\n')

*CSV Data Load (online and from file)*

In [None]:
import pandas as pd

# CSV data load: first from a file path, then straight from a URL.
# pd.read_csv accepts either kind of location.
local_csv_path = '/content/cars.csv'
csv_data = pd.read_csv(local_csv_path)
print("\n-------CSV data-------\n", csv_data.head(10), sep="\n")

online_csv_url = 'https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv'
online_csv_data = pd.read_csv(online_csv_url)
print("\n-------Online csv data-------\n", online_csv_data.head(), sep="\n")



*JSON File Load*

In [None]:
import pandas as pd
# JSON from a file path: show the first ten rows.
local_json_path = '/content/attendance.json'
json_data = pd.read_json(local_json_path)
print('\n------JSON DATA file-------\n', json_data.head(10), sep='\n')

# JSON fetched from a URL: show the default first five rows.
json_data_url = 'https://jsonplaceholder.typicode.com/users'
json_data_online = pd.read_json(json_data_url)
print('\n-----------------JSON DATA ONLINE-------------------\n', json_data_online.head(), sep='\n')


*Excel Data Load*

In [None]:
import pandas as pd
# Read the workbook (pandas loads the first sheet by default) and preview it.
excel_path = '/content/extra.xlsx'
excel_data = pd.read_excel(excel_path)
print('\n----------EXCEL DATA------------\n', excel_data.head(), sep='\n')

*Text Data Load*

In [None]:
import pandas as pd

# A tab-separated text file is read with read_csv by naming the separator.
text_path = '/content/data.txt'
text_data = pd.read_csv(text_path, sep='\t')
print('\n----------TEXT DATA------------\n', text_data, sep='\n')

*Creating sample data from random values with NumPy*

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the "random" table is identical on every run.
np.random.seed(42)

# Draw the columns one by one, in the same order as the table's columns:
# each call advances the shared random state, so the order fixes the values.
row_ids = range(1, 6)
scores = np.random.randint(1, 100, 5)
heights = np.random.normal(170, 10, 5).round(1)
weights = np.random.normal(60, 15, 5).round(1)

data = {'id': row_ids, 'score': scores, 'height': heights, 'weight': weights}

sample_data = pd.DataFrame(data)
print('\n--------RANDOM DATA---------\n', sample_data, sep='\n')

**DATA PRE-PROCESSING**

*Label encoding – for ordered (ordinal) data*

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# define data
data = pd.DataFrame({
    'classes': ['primary', 'secondary', 'undergraduate', 'postgraduate', 'secondary']
})

# Education levels from lowest to highest. The order is written out explicitly
# because sklearn's LabelEncoder numbers its classes in sorted (alphabetical)
# order, which would give 'postgraduate' the lowest code and lose the ranking.
class_order = ['primary', 'secondary', 'undergraduate', 'postgraduate']

# An ordered categorical assigns each value the position of its category:
# primary=0, secondary=1, undergraduate=2, postgraduate=3.
# A value that is not listed in class_order would be encoded as -1.
ordered_classes = pd.Categorical(data['classes'], categories=class_order, ordered=True)
data['classes'] = ordered_classes.codes.astype('int64')

# Show how many rows carry each code.
print(data.value_counts())


*One-hot encoding – for unordered (nominal) categories*

In [None]:

# Student names are plain categories with no ranking, so each distinct name
# becomes its own indicator column.
data = dict(students=['Maryam', 'Amna', 'Fatima', 'Maryam', 'Fatima'])

student_names = data['students']
one_Hot_data = pd.get_dummies(student_names)

# Count how often each distinct indicator row occurs.
indicator_row_counts = one_Hot_data.value_counts()
print(indicator_row_counts)



# Lab Home Task Activities
*Activity 1*

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Creating the province dataset (manual data creation).
# None marks a missing value in the Population and LiteracyRate columns.
province_data = {
    "Province": ["Punjab", "Sindh", "KPK", "Balochistan"],
    "Population": [200, 400, 300, None],
    "LiteracyRate": [90, 70, None, 20],
    "Region": ["North", "South", "East", "West"]
}
province_df = pd.DataFrame(province_data)
print(province_df)

#=================== Task 1: Dropping a row and filling the other gap with the median ===================

# Drop every row whose LiteracyRate is missing (data preprocessing).
df_drop_LiteracyRate = province_df.dropna(subset=["LiteracyRate"])
print("\n Drp the row of LITERACY RATE\n")
print(df_drop_LiteracyRate)

# Fill the remaining missing Population with the median of the rows that are left.
# median() ignores NaN, so the gap itself does not influence the value.
median_population = df_drop_LiteracyRate["Population"].median()
df_median_population = df_drop_LiteracyRate.fillna({"Population": median_population})
print("\n Median Population Row\n")
print(df_median_population)

# Work on an independent copy: the encoding below overwrites the Region column,
# and df_median_population should keep showing the Task 1 result.
df_1 = df_median_population.copy()

#=================== Task 2: Label encoding and one-hot encoding ===================

# Keep the original region names; both encodings start from them.
region_names = df_1["Region"].copy()

Region_le = LabelEncoder()
df_1["Region"] = Region_le.fit_transform(region_names)
print("\n Label Encoding\n")
print(df_1)

# One-hot encode the region NAMES, not the integer codes, so that the indicator
# columns are labelled with the regions they stand for.
Region_he = pd.get_dummies(region_names)
print("\n One-Hot Encoding\n")
print(Region_he)
print("\n")

#=================== Task 3: Visualization: Population vs Literacy Rate scatter plot ===================
plt.figure(figsize=(8, 6))
plt.scatter(df_1["Population"], df_1["LiteracyRate"])

# Label every point with its province, shifted slightly right so the text
# does not sit on top of the marker.
for population, literacy, province in zip(df_1["Population"], df_1["LiteracyRate"], df_1["Province"]):
    plt.text(population + 5, literacy, province, fontsize=10)

plt.xlabel("Population")
plt.ylabel("LiteracyRate")
plt.title("Population vs LiteracyRate")
plt.grid(True)
plt.show()

print("\nBalochistan is the outlier in literacy Rate here\n")



*Activity 2*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns


# Random data creation (seeded, so every run produces the same table).
np.random.seed(32)
Students = {
    "ID": range(1, 101),
    # Maths and English are stored as floats from the start: NaN is written into
    # them below, and NaN cannot be stored in an integer column. Writing it into
    # an int64 column relies on silent upcasting, which pandas has deprecated.
    "Maths": np.random.randint(0, 101, 100).astype(float),
    "Sceince": np.random.randint(0, 101, 100),
    "English": np.random.randint(0, 101, 100).astype(float),
    "Grade": np.random.choice(['A', 'B', 'C'], 100)
}
Students_df = pd.DataFrame(Students)

# Injecting a few missing values so there is something to impute.
Students_df.loc[5, "Maths"] = np.nan
Students_df.loc[2, "English"] = np.nan
Students_df.loc[1, "English"] = np.nan

# Printing the data frame
print(Students_df.head(10))

#=================== Task 1: Filling missing values with the column mean ===================

# mean() skips NaN, so each gap is filled with the mean of the observed scores.
Students_df["Maths"] = Students_df["Maths"].fillna(Students_df["Maths"].mean())
Students_df["English"] = Students_df["English"].fillna(Students_df["English"].mean())
print("\n After mean DATASET\n")
print(Students_df.head(10))


#=================== Task 2: Label encoding the Grade column ===================
# Grades_le is kept so the integer codes can be translated back to letters later.
Grades_le = LabelEncoder()
Students_df["Grade"] = Grades_le.fit_transform(Students_df["Grade"])


#=================== Task 3: Plotting Histogram ===================
plt.figure(figsize=(8, 5))
plt.hist(Students_df["Maths"], bins=10, color="purple", edgecolor="black")
plt.xlabel("Maths Scores")
plt.ylabel("Number of Students")
plt.title("Maths Scores")
plt.grid(True, alpha=0.3)
plt.show()

#=================== Task 4: Plotting Boxplot ===================
plt.figure(figsize=(7, 4))
Students_df["Total"] = Students_df["Maths"] + Students_df["Sceince"] + Students_df["English"]
sns.boxplot(x=Students_df["Total"])
plt.title("Total Score Boxplot")
plt.show()

print("\n Right now no student got unsually high ir low valu\n")

#=================== Task 5: Summarizing Findings ===================
# The summary is indexed by the encoded grade (0, 1, 2, ...).
Grade_Summary = Students_df.groupby("Grade")["Total"].mean()
print(Grade_Summary)

# Derive the best grade from the data and translate its code back to the letter,
# so the message cannot drift out of sync with the numbers printed above.
best_grade = Grades_le.inverse_transform([Grade_Summary.idxmax()])[0]
print(f"\nGrade {best_grade} performed best overall\n")


*Activity 3*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Loading data from online
covid_url = "https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv"
covid_df = pd.read_csv(covid_url)

#=================== Task 1: Extracting Pakistan's data ===================
# .copy() gives an independent frame, so the column changes below do not touch
# covid_df and do not trigger chained-assignment warnings.
pak_df = covid_df[covid_df["Country/Region"] == "Pakistan"].copy()
print(pak_df.head(), "\n")

#=================== Task 2: Handling missing values ===================
# NOTE(review): Province/State is presumably empty for every Pakistan row, which is
# why the whole column is dropped - confirm with pak_df.info() before relying on it.
pak_df = pak_df.drop(columns=["Province/State"])

# Parse the dates and put the rows in time order before anything is plotted or
# differenced. Left as strings, the dates are treated as categories on the x-axis
# (one tick per day) instead of as a time axis.
pak_df["Date"] = pd.to_datetime(pak_df["Date"])
pak_df = pak_df.sort_values("Date")
print(pak_df.head(), "\n")

#=================== Task 3: Line chart ===================
plt.figure(figsize=(7, 4))
plt.plot(pak_df["Date"], pak_df["Confirmed"], label="Confirmed", color="blue")
plt.ylabel("Confirmed Cases")
plt.xlabel("Date")
plt.title("COVID-19 Pakistan")
plt.show()

#=================== Task 4: Day with the highest number of new confirmed cases ===================
# The difference between consecutive rows of Confirmed is the day-to-day change.
# It goes into its own column so the original Confirmed series stays available.
# The first row has no predecessor, so its difference is NaN and is set to 0.
pak_df["DailyConfirmed"] = pak_df["Confirmed"].diff().fillna(0)

day = pak_df.loc[pak_df["DailyConfirmed"].idxmax()]
print("\nDay with highest cases\n", day, "\n")

#=================== Task 5: Boxplot of the daily change ===================
plt.figure(figsize=(7, 4))
sns.boxplot(x=pak_df["DailyConfirmed"])
plt.title("Covid-19 Pakistan")
plt.show()


*Activity 4*

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# =================== Task 1: Loading datasets (CSV, JSON, Excel) ===================
students_df = pd.read_csv('/content/students.csv')
attendance_df = pd.read_json('/content/attendance.json')
extra_df = pd.read_excel('/content/extra.xlsx')

# =================== Task 2: Merging datasets ===================
# Two successive joins on the shared "Name" column. The default join type keeps
# only the names that appear in every table.
students_with_attendance = pd.merge(students_df, attendance_df, on="Name")
merged_df = pd.merge(students_with_attendance, extra_df, on="Name")
print(merged_df)

# =================== Task 3: Scatter plot ===================
# Students with less than 70 attendance; drawn second so the red markers
# land on top of the blue ones.
is_low_attendance = merged_df["Attendance"].lt(70)
low_att = merged_df.loc[is_low_attendance]

plt.figure(figsize=(7, 4))
plt.scatter(merged_df["Attendance"], merged_df["Marks"], color="blue", label="Students")
plt.scatter(low_att["Attendance"], low_att["Marks"], color="red", label="Attendance < 70%")

plt.xlabel("Attendance")
plt.ylabel("Marks")
plt.title("Marks vs Attendance")
plt.legend()
plt.show()

# =================== Task 4: One-hot encoding ===================
# One indicator column per distinct student name.
student_names = merged_df["Name"]
Name_he = pd.get_dummies(student_names)
print(Name_he.head())

*Activity 5*

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Small employee table; None marks the two values that have to be imputed.
data = {
    "Name": ["Ahsan", "Hira", "Bilal", "Zara", "Salman", "Mahnoor"],
    "Age": [25, 27, 35, 29, None, 40],
    "Salary": [50000, None, 75000, 2000000, 60000, 90000],
    "Department": ["IT", "Finance", "IT", "HR", "Finance", "IT"]
}
df = pd.DataFrame(data)

# Task 1: fill the missing Age with the mean and the missing Salary with the median.
# Both statistics skip NaN, so they come from the observed values only.
age_fill = df["Age"].mean()
salary_fill = df["Salary"].median()
df["Age"] = df["Age"].fillna(age_fill)
df["Salary"] = df["Salary"].fillna(salary_fill)

print(df)

# Task 2: one-hot encode Department; the other columns pass through unchanged.
df_encoded = pd.get_dummies(df, columns=["Department"])
print(df_encoded)

# Task 3: boxplot of Salary to make outliers visible.
salaries = df["Salary"]
plt.figure(figsize=(7, 4))
sns.boxplot(x=salaries)
plt.title("Outliers in Salary")
plt.xlabel("Salary")
plt.show()

# Task 4: written conclusion drawn from the boxplot above.
print("Yes, zara's salary is treated as outlier as we see in boxplot its outside the othe ones")
