# Pandas for Data Science Tutorial

This notebook provides a detailed tutorial on using pandas for data handling in data science.

## 1. Introduction to pandas

### Creating DataFrames

In [None]:
import pandas as pd

# Creating a DataFrame from a dictionary
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [24, 27, 22, 32, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}

print(data)
df = pd.DataFrame(data)
df

{'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'], 'Age': [24, 27, 22, 32, 29], 'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']}


Unnamed: 0,Name,Age,City
0,Alice,24,New York
1,Bob,27,Los Angeles
2,Charlie,22,Chicago
3,David,32,Houston
4,Eve,29,Phoenix


In [None]:
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) # Changed number of columns in randn to 2 to match column names.
df

Unnamed: 0,A,B
0,0.469809,0.372901
1,-1.039874,0.484188
2,0.133136,1.254706
3,1.50669,0.684014
4,-0.997056,0.493645
5,0.147699,-0.352538
6,-1.172035,0.946056
7,-1.76358,0.798111
8,-0.503468,0.769229
9,-1.102623,0.934903


In [None]:
ls

[0m[01;34msample_data[0m/


### Reading Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
cd /content/drive/MyDrive/Machine learning 2024/Notebooks - Machine Learning/KNN

/content/drive/MyDrive/Machine learning 2024/Notebooks - Machine Learning/KNN


In [None]:
ls

adult.csv  adult-income-k-nearest-neighbors-knn.ipynb


In [None]:
# Reading a CSV file
df_csv = pd.read_csv('adult.csv')
df_csv.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [None]:
df_csv.tail(10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
48832,32,Private,34066,10th,6,Married-civ-spouse,Handlers-cleaners,Husband,Amer-Indian-Eskimo,Male,0,0,40,United-States,<=50K
48833,43,Private,84661,Assoc-voc,11,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,<=50K
48834,32,Private,116138,Masters,14,Never-married,Tech-support,Not-in-family,Asian-Pac-Islander,Male,0,0,11,Taiwan,<=50K
48835,53,Private,321865,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
48836,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
48841,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [None]:
# Reading an Excel file
df_excel = pd.read_excel('Ahmed.xlsx')
df_excel.head()

Unnamed: 0,Gov Code,RecommendTrack,RecommendJobProfile,Graduation,Days,Round Code,Sum of Count,Training Provider,IT ONL Start Date,IT ONL Days,IT ONL Time,IT PHY Start Date,IT PHY Days,IT PHY Time,IT Instructor Name,New Time
0,QAL,AI & Data Science,Microsoft Machine Learning Engineer,G,e,QAL1_AIS5_G1e,14,GLB,2024-05-09,Frid,9AM-12PM,,SAT,9AM-12AM,Ahmed Yousry,
1,QAL,AI & Data Science,Microsoft Machine Learning Engineer,S,e,QAL1_AIS5_S1e,20,GLB,2024-04-30,Frid,2PM-5PM,,SAT,2PM-5PM,Ahmed Yousry,
2,QAL,AI & Data Science,Microsoft Machine Learning Engineer,S,d,QAL1_AIS5_S3d,16,GLB,2024-04-29,Sun,6PM-9PM,,SAT,5PM-8PM,Ahmed Yousry,


### Data Inspection

In [None]:
# Displaying the first five rows
df_csv.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [None]:
# Displaying the last five rows
df_csv.tail()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
48841,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [None]:
df_csv.shape

(48842, 15)

In [None]:
# Displaying summary statistics
df_csv.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,48842.0,38.643585,13.71051,17.0,28.0,37.0,48.0,90.0
fnlwgt,48842.0,189664.134597,105604.025423,12285.0,117550.5,178144.5,237642.0,1490400.0
educational-num,48842.0,10.078089,2.570973,1.0,9.0,10.0,12.0,16.0
capital-gain,48842.0,1079.067626,7452.019058,0.0,0.0,0.0,0.0,99999.0
capital-loss,48842.0,87.502314,403.004552,0.0,0.0,0.0,0.0,4356.0
hours-per-week,48842.0,40.422382,12.391444,1.0,40.0,40.0,45.0,99.0


In [None]:

# Displaying information about the DataFrame
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [None]:
df_csv.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,0
relationship,0
race,0
gender,0


In [1]:
import pandas as pd

# Creating a DataFrame from a dictionary
data = {
    'Name': ['Alice', None, 'Charlie', 'David', 'Eve', 'Eve'],
    'Age': [24, 27, None, 32, 29, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix','Phoenix']
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alice,24.0,New York
1,,27.0,Los Angeles
2,Charlie,,Chicago
3,David,32.0,Houston
4,Eve,29.0,Phoenix
5,Eve,29.0,Phoenix


### Data Cleaning

In [2]:
# Handling missing values
df['Age'].fillna(df['Age'].mean(), inplace=True)
df

Unnamed: 0,Name,Age,City
0,Alice,24.0,New York
1,,27.0,Los Angeles
2,Charlie,28.2,Chicago
3,David,32.0,Houston
4,Eve,29.0,Phoenix
5,Eve,29.0,Phoenix


In [3]:
# prompt: need to fil nan values in Name featuire with most frquent

df['Name'].fillna(df['Name'].mode()[0], inplace=True)
df


Unnamed: 0,Name,Age,City
0,Alice,24.0,New York
1,Eve,27.0,Los Angeles
2,Charlie,28.2,Chicago
3,David,32.0,Houston
4,Eve,29.0,Phoenix
5,Eve,29.0,Phoenix


In [4]:
df.dropna(subset=['Name'], inplace=True)
df

Unnamed: 0,Name,Age,City
0,Alice,24.0,New York
1,Eve,27.0,Los Angeles
2,Charlie,28.2,Chicago
3,David,32.0,Houston
4,Eve,29.0,Phoenix
5,Eve,29.0,Phoenix


In [5]:
# prompt: i need to check for dublication in df

# Checking for duplicates
print(df.duplicated())

# Removing duplicates
df.drop_duplicates(inplace=True)
df


0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool


Unnamed: 0,Name,Age,City
0,Alice,24.0,New York
1,Eve,27.0,Los Angeles
2,Charlie,28.2,Chicago
3,David,32.0,Houston
4,Eve,29.0,Phoenix


In [6]:
# Renaming columns
df.rename(columns={'City': 'Location' }, inplace=True)
df

Unnamed: 0,Name,Age,Location
0,Alice,24.0,New York
1,Eve,27.0,Los Angeles
2,Charlie,28.2,Chicago
3,David,32.0,Houston
4,Eve,29.0,Phoenix


### Data Transformation

In [7]:
# Applying functions to columns
df['Age'] = df['Age'].apply(lambda x: x + 1)

# Binning data
df['Age Group'] = pd.cut(df['Age'], bins=[20,25, 30, 40], labels=['20-25','25-30' ,'30-40'])

In [8]:
df

Unnamed: 0,Name,Age,Location,Age Group
0,Alice,25.0,New York,20-25
1,Eve,28.0,Los Angeles,25-30
2,Charlie,29.2,Chicago,25-30
3,David,33.0,Houston,30-40
4,Eve,30.0,Phoenix,25-30


In [9]:
# Encoding categorical variables
df = pd.get_dummies(df, columns=['Location'])
df

Unnamed: 0,Name,Age,Age Group,Location_Chicago,Location_Houston,Location_Los Angeles,Location_New York,Location_Phoenix
0,Alice,25.0,20-25,False,False,False,True,False
1,Eve,28.0,25-30,False,False,True,False,False
2,Charlie,29.2,25-30,True,False,False,False,False
3,David,33.0,30-40,False,True,False,False,False
4,Eve,30.0,25-30,False,False,False,False,True


In [10]:
# prompt: convert Location to number using label encoder

import pandas as pd
from google.colab import drive
from sklearn.preprocessing import LabelEncoder

data = {
    'Name': ['Alice', None, 'Charlie', 'David', 'Eve'],
    'Age': [24, 27, 22, 32, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alice,24,New York
1,,27,Los Angeles
2,Charlie,22,Chicago
3,David,32,Houston
4,Eve,29,Phoenix


In [11]:
le = LabelEncoder()
df['City'] = le.fit_transform(df['City'])
df

Unnamed: 0,Name,Age,City
0,Alice,24,3
1,,27,2
2,Charlie,22,0
3,David,32,1
4,Eve,29,4


In [12]:
# prompt: ineed to convert the City column using dummies
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [24, 27, 22, 32, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}
df = pd.DataFrame(data)

# Encoding categorical variables
df = pd.get_dummies(df, columns=['City'])
df

Unnamed: 0,Name,Age,City_Chicago,City_Houston,City_Los Angeles,City_New York,City_Phoenix
0,Alice,24,False,False,False,True,False
1,Bob,27,False,False,True,False,False
2,Charlie,22,True,False,False,False,False
3,David,32,False,True,False,False,False
4,Eve,29,False,False,False,False,True


### Merging and Joining DataFrames

In [13]:
# Creating another DataFrame
data2 = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Salary': [70000, 80000, 120000, 90000, 110000]
}
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,Name,Salary
0,Alice,70000
1,Bob,80000
2,Charlie,120000
3,David,90000
4,Eve,110000


In [14]:
# Merging DataFrames
merged_df = pd.merge(df, df2, on='Name')
merged_df

Unnamed: 0,Name,Age,City_Chicago,City_Houston,City_Los Angeles,City_New York,City_Phoenix,Salary
0,Alice,24,False,False,False,True,False,70000
1,Bob,27,False,False,True,False,False,80000
2,Charlie,22,True,False,False,False,False,120000
3,David,32,False,True,False,False,False,90000
4,Eve,29,False,False,False,False,True,110000
