# Introduction to Pandas

# What is pandas?
***Pandas is a Python library used for data analysis and data manipulation.***

# ==============================================================
# 📌 Why We Use Pandas in Machine Learning
# ==============================================================

# 1️⃣ Data Handling (Tabular Data)
# - Real-world datasets (CSV, Excel, SQL) are tabular.
# - Pandas DataFrame makes it easy to store & manipulate such data.

# Example:
# df = pd.read_csv("data.csv")
# print(df.head())

# --------------------------------------------------------------

# 2️⃣ Data Cleaning
# - ML models require clean data.
# - Pandas provides:
#   * dropna() → remove missing values
#   * fillna() → replace missing values
#   * drop_duplicates() → remove duplicates
#   * astype() → fix data types
#   * replace() → fix wrong values

# --------------------------------------------------------------

# 3️⃣ Data Transformation
# - Create new features (apply(), assign()).
# - Normalize / scale data before training.
# - Convert categorical data to numbers (get_dummies()).

# --------------------------------------------------------------

# 4️⃣ Data Exploration (EDA)
# - Understand data before training.
#   * describe() → summary statistics
#   * value_counts() → frequency counts
#   * groupby() → grouping and aggregation
#   * loc[] / iloc[] → filtering rows/columns

# --------------------------------------------------------------

# 5️⃣ Integration with ML Libraries
# - Pandas works smoothly with scikit-learn, TensorFlow, PyTorch.
# - DataFrame → easily converted to NumPy arrays for ML models.
# - Makes data preprocessing → model training pipeline efficient.

# --------------------------------------------------------------

# 6️⃣ Input / Output (I/O)
# - Read from CSV, Excel, SQL, JSON.
# - Save/export cleaned data for future use.

# --------------------------------------------------------------

# ✅ Summary:
# - Pandas is NOT used to train ML models.
# - Pandas is used for data preprocessing, cleaning, exploration,
#   and transformation (often estimated at around 80% of the effort in an ML project).
# - Without Pandas, ML data preparation would be slow & hard.

# 👉 In short:
# "Pandas helps prepare clean, structured, numerical data
#  that machine learning models can understand."
# ==============================================================



In [6]:
from pandas import DataFrame

print("Introduction to Pandas")


Introduction to Pandas


In [7]:
!conda install pandas

Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.





    current version: 25.3.1
    latest version: 25.7.0

Please update conda by running

    $ conda update -n base -c defaults conda




# Key data structures in pandas
* Series — a one-dimensional labeled array
* DataFrame — a two-dimensional labeled table

In [11]:
import pandas as pd

In [15]:
# A five-element Series whose index labels are the letters a-e instead of 0-4.
s = pd.Series(data=[1, 2, 3, 4, 5], index=list("abcde"))

In [16]:
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [20]:
# Build a two-column table from per-student (name, marks) records.
df = pd.DataFrame(
    [("Asad", 90), ("Asif", 80), ("Ramy", 77)],
    columns=["Name", "Marks"],
)

In [21]:
df

Unnamed: 0,Name,Marks
0,Asad,90
1,Asif,80
2,Ramy,77


In [23]:
# Load the sample CSV into a DataFrame (the path is relative to the working directory).
# NOTE(review): the info() output further down lists the second column as ' "Game Length"',
# i.e. with a leading space and literal double quotes in the name. Presumably the header row
# has a space after the comma; passing skipinitialspace=True to read_csv should give a clean
# "Game Length" column name — TODO confirm against the file.
dfd = pd.read_csv("data/sample.csv")

In [24]:
dfd

Unnamed: 0,Game Number,"""Game Length"""
0,1,30
1,2,29
2,3,31
3,4,16
4,5,24
...,...,...
995,996,24
996,997,21
997,998,47
998,999,30


In [25]:
dfd.head()

Unnamed: 0,Game Number,"""Game Length"""
0,1,30
1,2,29
2,3,31
3,4,16
4,5,24


In [26]:
dfd.tail()

Unnamed: 0,Game Number,"""Game Length"""
995,996,24
996,997,21
997,998,47
998,999,30
999,1000,12


In [27]:
dfd.describe()

Unnamed: 0,Game Number,"""Game Length"""
count,1000.0,1000.0
mean,500.5,35.411
std,288.819436,23.412608
min,1.0,7.0
25%,250.75,19.0
50%,500.5,30.0
75%,750.25,45.0
max,1000.0,197.0


In [29]:
dfd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Game Number     1000 non-null   int64
 1    "Game Length"  1000 non-null   int64
dtypes: int64(2)
memory usage: 15.8 KB


In [31]:
type(dfd['Game Number'])

pandas.core.series.Series

In [36]:
dfd[['Game Number']]

Unnamed: 0,Game Number
0,1
1,2
2,3
3,4
4,5
...,...
995,996
996,997
997,998
998,999


In [37]:
dfd.iloc[0]

Game Number        1
 "Game Length"    30
Name: 0, dtype: int64

In [4]:
import pandas as pd

In [5]:
# Read the CSV again; the execution counter restarted before this cell, so this is a new session.
# NOTE(review): this load shows missing values and float64 columns, whereas the earlier load
# of the same path was all int64 with no nulls — presumably the file was edited in between.
dfd = pd.read_csv("data/sample.csv")

In [6]:
dfd

Unnamed: 0,Game Number,"""Game Length"""
0,1.0,30.0
1,2.0,
2,3.0,31.0
3,4.0,
4,,24.0
...,...,...
995,996.0,24.0
996,997.0,21.0
997,998.0,47.0
998,999.0,30.0


In [7]:
# Drop every row that contains at least one missing value.
# The result is a new DataFrame; dfd itself is not modified (it still shows NaNs when
# displayed again later), so assign the result to a name to keep it.
dfd.dropna()

Unnamed: 0,Game Number,"""Game Length"""
0,1.0,30.0
2,3.0,31.0
5,6.0,29.0
7,8.0,117.0
8,9.0,42.0
...,...,...
995,996.0,24.0
996,997.0,21.0
997,998.0,47.0
998,999.0,30.0


In [9]:
# Replace every missing value with 0.
# This also returns a new DataFrame and leaves dfd untouched; use
# dfd = dfd.fillna(0) to keep the filled version.
dfd.fillna(0)

Unnamed: 0,Game Number,"""Game Length"""
0,1.0,30.0
1,2.0,0.0
2,3.0,31.0
3,4.0,0.0
4,0.0,24.0
...,...,...
995,996.0,24.0
996,997.0,21.0
997,998.0,47.0
998,999.0,30.0


In [10]:
dfd

Unnamed: 0,Game Number,"""Game Length"""
0,1.0,30.0
1,2.0,
2,3.0,31.0
3,4.0,
4,,24.0
...,...,...
995,996.0,24.0
996,997.0,21.0
997,998.0,47.0
998,999.0,30.0


In [11]:
dfd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Game Number     998 non-null    float64
 1    "Game Length"  994 non-null    float64
dtypes: float64(2)
memory usage: 15.8 KB


In [15]:
# Cast "Game Number" to float and store the converted column back in the frame.
# The preceding info() output already reports this column as float64, so the dtype
# is the same before and after this cell.
dfd['Game Number'] = dfd['Game Number'].astype(float)

In [17]:
dfd

Unnamed: 0,Game Number,"""Game Length"""
0,1.0,30.0
1,2.0,
2,3.0,31.0
3,4.0,
4,,24.0
...,...,...
995,996.0,24.0
996,997.0,21.0
997,998.0,47.0
998,999.0,30.0


In [19]:
dfd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Game Number     998 non-null    float64
 1    "Game Length"  994 non-null    float64
dtypes: float64(2)
memory usage: 15.8 KB


In [20]:
# Read a single value with one explicit label-based lookup (row label 0, column
# "Game Number") instead of chaining two [] operations, where an integer key on the
# intermediate Series is ambiguous between label and position.
dfd.loc[0, 'Game Number']

np.float64(1.0)

In [21]:
# Add a constant column: assigning a scalar broadcasts it to every row, so there is
# no need to build a list of len(dfd) zeros by hand.
dfd["zeroes"] = 0

In [22]:
dfd

Unnamed: 0,Game Number,"""Game Length""",zeroes
0,1.0,30.0,0
1,2.0,,0
2,3.0,31.0,0
3,4.0,,0
4,,24.0,0
...,...,...,...
995,996.0,24.0,0
996,997.0,21.0,0
997,998.0,47.0,0
998,999.0,30.0,0


In [23]:
def fx(a):
    """Return ``a`` multiplied by itself (the square of ``a``)."""
    squared = a * a
    return squared
# Call fx on each "Game Number" value and store the results as a new column.
# NOTE(review): fx returns its argument multiplied by itself, so this column holds the
# square of "Game Number"; the label "zeroes + 1" does not describe what it contains.
dfd["zeroes + 1"] = dfd["Game Number"].apply(fx)

In [24]:
dfd

Unnamed: 0,Game Number,"""Game Length""",zeroes,zeroes + 1
0,1.0,30.0,0,1.0
1,2.0,,0,4.0
2,3.0,31.0,0,9.0
3,4.0,,0,16.0
4,,24.0,0,
...,...,...,...,...
995,996.0,24.0,0,992016.0
996,997.0,21.0,0,994009.0
997,998.0,47.0,0,996004.0
998,999.0,30.0,0,998001.0


In [29]:
dfd.to_csv("data/export.csv", index=False)

In [30]:
dfd

Unnamed: 0,Game Number,"""Game Length""",zeroes,zeroes + 1
0,1.0,30.0,0,1.0
1,2.0,,0,4.0
2,3.0,31.0,0,9.0
3,4.0,,0,16.0
4,,24.0,0,
...,...,...,...,...
995,996.0,24.0,0,992016.0
996,997.0,21.0,0,994009.0
997,998.0,47.0,0,996004.0
998,999.0,30.0,0,998001.0


In [31]:
# Small example frame: one (name, marks) record per student.
df1 = pd.DataFrame(
    data=[("asad", 30), ("asif", 40), ("ramy", 50)],
    columns=["names", "marks"],
)

In [32]:
df1

Unnamed: 0,names,marks
0,asad,30
1,asif,40
2,ramy,50


In [33]:

# Second example frame with a "names" column and a "marks" column.
df2 = pd.DataFrame(dict(names=["asad", "asif", "ramy"], marks=[30, 40, 50]))

In [34]:
df2

Unnamed: 0,names,marks
0,asad,30
1,asif,40
2,ramy,50
