# PANDAS COURSE

### What is Pandas?

Pandas is an open-source library in Python used for data manipulation, analysis, and cleaning. It provides data structures and functions to efficiently manipulate numerical tables and time series data.

### Installation

In [None]:
# Installation: run once in a terminal, or inside a notebook cell as `%pip install pandas`
# pip install pandas

### Importing Pandas

In [1]:
# Importing Pandas
import pandas as pd

### Data Structures in Pandas

Pandas primarily deals with three data structures:

1. Series: A one-dimensional labeled array capable of holding any data type.
2. DataFrame: A two-dimensional labeled data structure with columns of potentially different types.
3. Index: An immutable sequence that holds the axis labels (row and column labels) of a Series or DataFrame.

### Creating a Series

In [13]:
# Creating a Series from a list; pandas adds a default integer index (0, 1, 2, ...)
# The Series gets its own name so it is not confused with the `data` dict used for the DataFrame example
numbers = pd.Series([1, 2, 3, 4])
numbers

0    1
1    2
2    3
3    4
dtype: int64

### Creating a DataFrame

In [14]:
# Build a DataFrame from a dictionary: each key becomes a column name,
# each list holds that column's values (all lists have the same length)
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 35, 40],
    City=['New York', 'San Francisco', 'Seattle', 'Chicago'],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Seattle
3,David,40,Chicago


### Reading Data

In [19]:
# Reading a CSV file into a DataFrame
# 'grade.csv' is a relative path, so the file must be in the notebook's working directory
df_grade = pd.read_csv('grade.csv')

### Data Information

In [26]:
# Returns a tuple representing the dimensions of the DataFrame (rows, columns)
# shape is an attribute, not a method, so it is accessed without parentheses
df_grade.shape

(67, 7)

In [20]:
# Viewing the first rows: head() returns the first 5 rows by default; use head(n) for the first n
df_grade.head()

Unnamed: 0,student,year_academic,math,science,history,lenguage,computer_science
0,101,1,20,0.0,16.0,6.0,15
1,102,1,16,7.0,20.0,14.0,16
2,103,1,10,5.0,3.0,10.0,14
3,105,1,20,6.0,7.0,15.0,12
4,106,1,20,18.0,12.0,9.0,13


In [21]:
# Viewing the last rows: tail() returns the last 5 rows by default; use tail(n) for the last n
df_grade.tail()

Unnamed: 0,student,year_academic,math,science,history,lenguage,computer_science
62,177,5,14,9.0,3.0,1.0,20
63,178,5,20,20.0,1.0,20.0,16
64,179,5,9,7.0,3.0,20.0,12
65,183,5,14,8.0,2.0,8.0,15
66,184,5,19,11.0,12.0,11.0,17


In [25]:
# Print a summary of the DataFrame: index range, column names, non-null counts, dtypes and memory usage
# A non-null count lower than the number of entries means the column has missing values
df_grade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   student           67 non-null     int64  
 1   year_academic     67 non-null     int64  
 2   math              67 non-null     int64  
 3   science           66 non-null     float64
 4   history           65 non-null     float64
 5   lenguage          66 non-null     float64
 6   computer_science  67 non-null     int64  
dtypes: float64(3), int64(4)
memory usage: 3.8 KB


In [27]:
# Retrieves the column labels of the DataFrame as an Index object
df_grade.columns

Index(['student', 'year_academic', 'math', 'science', 'history', 'lenguage',
       'computer_science'],
      dtype='object')

### Data Selection

In [33]:
# Selecting a single column with bracket notation; the result is a Series
df_grade['student']

0     101
1     102
2     103
3     105
4     106
     ... 
62    177
63    178
64    179
65    183
66    184
Name: student, Length: 67, dtype: int64

In [32]:
# Selecting the same column with attribute (dot) notation
# This only works when the column name is a valid Python identifier and does not
# clash with an existing DataFrame attribute or method; bracket notation always works
df_grade.student

0     101
1     102
2     103
3     105
4     106
     ... 
62    177
63    178
64    179
65    183
66    184
Name: student, Length: 67, dtype: int64

In [23]:
# Selecting multiple columns: pass a list of column names (hence the double brackets)
# The result is a DataFrame containing only those columns
df_grade[['math', 'computer_science']]

Unnamed: 0,math,computer_science
0,20,15
1,16,16
2,10,14
3,20,12
4,20,13
...,...,...
62,14,20
63,20,16
64,9,12
65,14,15


In [41]:
# Selecting rows using slicing by position; the end position is excluded, so 1:6 returns rows 1 through 5
df_grade[1:6]

Unnamed: 0,student,year_academic,math,science,history,lenguage,computer_science
1,102,1,16,7.0,20.0,14.0,16
2,103,1,10,5.0,3.0,10.0,14
3,105,1,20,6.0,7.0,15.0,12
4,106,1,20,18.0,12.0,9.0,13
5,107,1,20,5.0,9.0,4.0,14


In [40]:
# Select the value at row 1 (second row) and column 2 (third column)
# iloc indexes purely by integer position, counting from 0 for both rows and columns
df_grade.iloc[1, 2]

16

### Data Manipulation

In [45]:
# Creating a new column with a scalar value: the same value is assigned to every row
df_grade['test'] = 'value_test'
df_grade.head()

Unnamed: 0,student,year_academic,math,science,history,lenguage,computer_science,test
0,101,1,20,0.0,16.0,6.0,15,value_test
1,102,1,16,7.0,20.0,14.0,16,value_test
2,103,1,10,5.0,3.0,10.0,14,value_test
3,105,1,20,6.0,7.0,15.0,12,value_test
4,106,1,20,18.0,12.0,9.0,13,value_test


In [46]:
# Creating a new column based on existing columns: the mean of the subject grades
# Listing the subjects once keeps the divisor in sync with the columns being averaged
subject_columns = ['math', 'science', 'history', 'lenguage', 'computer_science']
# skipna=False keeps final_grade as NaN when any subject grade is missing,
# instead of silently skipping the missing grade in the sum
df_grade['final_grade'] = df_grade[subject_columns].sum(axis=1, skipna=False) / len(subject_columns)
df_grade.head()

Unnamed: 0,student,year_academic,math,science,history,lenguage,computer_science,test,final_grade
0,101,1,20,0.0,16.0,6.0,15,value_test,11.4
1,102,1,16,7.0,20.0,14.0,16,value_test,14.6
2,103,1,10,5.0,3.0,10.0,14,value_test,8.4
3,105,1,20,6.0,7.0,15.0,12,value_test,12.0
4,106,1,20,18.0,12.0,9.0,13,value_test,14.4


In [47]:
# Creating a new column using apply() with a function
def status(row, passing_grade=11):
    """Classify a student as 'passed' or 'failed' from their final grade.

    Parameters
    ----------
    row : mapping-like
        Anything that supports ``row['final_grade']``, e.g. the row that
        ``DataFrame.apply(..., axis=1)`` passes to the function.
    passing_grade : number, default 11
        Minimum final grade (inclusive) required to pass.

    Returns
    -------
    str
        'passed' if ``row['final_grade'] >= passing_grade``, otherwise 'failed'.

    Notes
    -----
    A missing final grade (NaN) compares False against any threshold and is
    therefore reported as 'failed'.
    """
    return 'passed' if row['final_grade'] >= passing_grade else 'failed'

# axis=1 makes apply() call status once per row; each returned label becomes that row's value in the new column
df_grade['status'] = df_grade.apply(status, axis=1)
df_grade.head()

Unnamed: 0,student,year_academic,math,science,history,lenguage,computer_science,test,final_grade,status
0,101,1,20,0.0,16.0,6.0,15,value_test,11.4,passed
1,102,1,16,7.0,20.0,14.0,16,value_test,14.6,passed
2,103,1,10,5.0,3.0,10.0,14,value_test,8.4,failed
3,105,1,20,6.0,7.0,15.0,12,value_test,12.0,passed
4,106,1,20,18.0,12.0,9.0,13,value_test,14.4,passed
