In [3]:
import pandas as pd

# Creating a Series from a list
series = pd.Series([10, 20, 30, 40, 50])
print(series)

0    10
1    20
2    30
3    40
4    50
dtype: int64


In [4]:
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35],
        'City': ['New York', 'Los Angeles', 'Chicago']}
df = pd.DataFrame(data)
print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [5]:
df.head()
##The head() method returns the first five rows of the DataFrame. This is useful for quickly examining the top of a dataset.

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [6]:
df.info()
###The info() method provides a summary of the DataFrame, including the number of non-null entries and the data type of each column.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes


In [7]:
df.describe()
###The describe() method generates descriptive statistics, such as mean, standard deviation, min, and max, for numerical columns.

Unnamed: 0,Age
count,3.0
mean,30.0
std,5.0
min,25.0
25%,27.5
50%,30.0
75%,32.5
max,35.0


In [8]:
print(df['Name'])
###You can select a single column from a DataFrame using the column name. This returns a Series object.

0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object


In [11]:
filtered_df=df[df['Age']>30]
print(filtered_df)
#You can filter rows based on a condition.
#  Here, only rows where the 'Age' column is greater than 30 are selected.

      Name  Age     City
2  Charlie   35  Chicago


In [14]:
###select specific rows and coloumn
selected_data=df.loc[0:1,['Name','City']]
print(selected_data)
#The loc[] method is used for label-based indexing to select specific rows and columns. 
# Here, rows with indices 0 and 1 and columns 'Name' and 'City' are selected.

##loc[row , col] 


    Name         City
0  Alice     New York
1    Bob  Los Angeles


In [15]:
df['salary']=[70000,80000,90000]
print(df)
##modifying data frame by adding coloumns

      Name  Age         City  salary
0    Alice   25     New York   70000
1      Bob   30  Los Angeles   80000
2  Charlie   35      Chicago   90000


In [17]:
df['Age']+=1
print(df)
##updating coloumn values
#This updates the values in the 'Age' column 
# by adding 1 to each value. Pandas allows vectorized operations, making this efficient.

      Name  Age         City  salary
0    Alice   27     New York   70000
1      Bob   32  Los Angeles   80000
2  Charlie   37      Chicago   90000


In [20]:
#dropping coloumns
df=df.drop('City',axis=1)
print(df)
##try put axis zero
df=df.drop('City',axis=0)
print(df)
##it wil give error because axis 0 means row and there is no row called city

      Name  Age  salary
0    Alice   27   70000
1      Bob   32   80000
2  Charlie   37   90000


In [27]:
df_with_nan = pd.DataFrame({'A': [1, 2, None], 'B': [4, None, 6]})
print(df_with_nan.isnull())
##detecting missing values

       A      B
0  False  False
1  False   True
2   True  False


In [28]:
df_with_nan = pd.DataFrame({'A': [1, 2, None], 'B': [None, None, 6]})
df_with_nan

Unnamed: 0,A,B
0,1.0,
1,2.0,
2,,6.0


In [29]:
df_with_nan.isnull()

Unnamed: 0,A,B
0,False,True
1,False,True
2,True,False


In [30]:
df_with_nan.isna()

Unnamed: 0,A,B
0,False,True
1,False,True
2,True,False


In [31]:
df_with_nan.isnull().sum()

A    1
B    2
dtype: int64

In [32]:
df_with_nan.isnull().sum().sum()

np.int64(3)

Filling Missing Values 

In [33]:
filled_df = df_with_nan.fillna(0)
print(filled_df)


     A    B
0  1.0  0.0
1  2.0  0.0
2  0.0  6.0


In [38]:
df_with_nan = pd.DataFrame({'A': [1, 2, None], 'B': [None, None, 6]})
df_with_nan['A']=df_with_nan['A'].fillna(df_with_nan['A'].mean())
df_with_nan['B']=df_with_nan['B'].fillna(df_with_nan['B'].mean())
print(df_with_nan)

     A    B
0  1.0  6.0
1  2.0  6.0
2  1.5  6.0


Dropping rows with missing values

In [40]:
df_with_nan = pd.DataFrame({'A': [1, 2, None], 'B': [4, None, 6]})
cleaned_df=df_with_nan.dropna()
print(cleaned_df)
#try axis 1 it will give error as it will be coloumns

     A    B
0  1.0  4.0


Grouping and aggregation

In [43]:
df = pd.DataFrame({'Department': ['HR', 'IT', 'HR', 'IT'],
                   'Salary': [50000, 60000, 45000, 80000]})
grouped = df.groupby('Department').mean()
print(grouped)

             Salary
Department         
HR          47500.0
IT          70000.0


Applying custom functions with apply()


In [44]:
def double_salary(x):
    return x * 2

df['Double Salary'] = df['Salary'].apply(double_salary)
print(df)

  Department  Salary  Double Salary
0         HR   50000         100000
1         IT   60000         120000
2         HR   45000          90000
3         IT   80000         160000


In [46]:
grouped=df.groupby('Department').mean()
print(grouped)

             Salary  Double Salary
Department                        
HR          47500.0        95000.0
IT          70000.0       140000.0


practical examples

In [74]:
import pandas as pd
# Load the CSV file into a Pandas DataFrame
df_grades= pd.read_csv('Pandas\Pandas\Pandas lec\Data\Grades_Short.csv')
df_grades.head()

  df_grades= pd.read_csv('Pandas\Pandas\Pandas lec\Data\Grades_Short.csv')


Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,90743
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A,7284
2,Susan,30.0,1,19.0,19,1,10.5,33.0,A-,7625
3,Sol,31.0,1,22.0,13,1,13.0,34.0,A,1237
4,Chris,30.0,1,19.0,17,1,12.5,33.5,A,62


In [75]:
df_grades.head(3)

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,90743
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A,7284
2,Susan,30.0,1,19.0,19,1,10.5,33.0,A-,7625


In [76]:
df_grades.shape

(7, 10)

In [77]:
df_grades.dtypes

Name               object
Previous_Part     float64
Participation1      int64
Mini_Exam1        float64
Mini_Exam2          int64
Participation2      int64
Mini_Exam3        float64
Final             float64
Grade              object
ID                  int64
dtype: object

In [78]:
#get coloumn name
df_grades.columns

Index(['Name', 'Previous_Part', 'Participation1', 'Mini_Exam1', 'Mini_Exam2',
       'Participation2', 'Mini_Exam3', 'Final', 'Grade', 'ID'],
      dtype='object')

In [21]:
#get row names
df_grades.index

RangeIndex(start=0, stop=7, step=1)

In [79]:
#get specific coloumn
df_grades['Name']

0     Jake
1      Joe
2    Susan
3      Sol
4    Chris
5    Tarik
6    Malik
Name: Name, dtype: object

In [80]:
df_grades.Name

0     Jake
1      Joe
2    Susan
3      Sol
4    Chris
5    Tarik
6    Malik
Name: Name, dtype: object

In [81]:
#select multiple coloumns
df_grades[['Name','Grade']]

Unnamed: 0,Name,Grade
0,Jake,A
1,Joe,A
2,Susan,A-
3,Sol,A
4,Chris,A
5,Tarik,B
6,Malik,A


In [82]:
#storing result
names=df_grades.Name
print(names)
#now name is a series

0     Jake
1      Joe
2    Susan
3      Sol
4    Chris
5    Tarik
6    Malik
Name: Name, dtype: object


In [83]:
#slicing a series
names[[0,1,4]]

0     Jake
1      Joe
4    Chris
Name: Name, dtype: object

In [84]:
names[1:5]

1      Joe
2    Susan
3      Sol
4    Chris
Name: Name, dtype: object

Slicing data frame

In [85]:
first_name=df_grades.loc[0,'Name']
first_name

'Jake'

In [86]:
first_row=df_grades.loc[0,]
first_row

Name               Jake
Previous_Part      32.0
Participation1        1
Mini_Exam1         19.5
Mini_Exam2           20
Participation2        1
Mini_Exam3         10.0
Final              33.0
Grade                 A
ID                90743
Name: 0, dtype: object

In [87]:
slice_one=df_grades.loc[0:2,'Name':'Mini_Exam2']
slice_one
###end points are inclusive

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2
0,Jake,32.0,1,19.5,20
1,Joe,32.0,1,20.0,16
2,Susan,30.0,1,19.0,19


In [88]:
##arbitary chunk
slice=df_grades.loc[[0,2,4],['Name','Mini_Exam2','Grade']]
slice

Unnamed: 0,Name,Mini_Exam2,Grade
0,Jake,20,A
2,Susan,19,A-
4,Chris,17,A


Built in Functions

In [45]:
df_grades.Final.mean()
##avreage score using mean built in function

np.float64(32.214285714285715)

In [46]:
max_mini1=df_grades.Mini_Exam1.max()
max_mini1

np.float64(22.0)

In [48]:
summary_df=df_grades.describe()
summary_df
##summare_df is a date frame describing all coloumns

Unnamed: 0,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,ID
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
mean,31.071429,1.0,19.785714,17.857143,1.0,11.0,32.214286,29111.0
std,0.838082,0.0,1.074598,2.734262,0.0,2.217356,3.828154,41131.08167
min,30.0,1.0,19.0,13.0,1.0,8.0,24.0,62.0
25%,30.5,1.0,19.0,16.5,1.0,9.5,32.5,4260.5
50%,31.0,1.0,19.5,19.0,1.0,10.5,33.0,7625.0
75%,31.75,1.0,20.0,19.5,1.0,12.75,33.75,48413.0
max,32.0,1.0,22.0,21.0,1.0,14.0,36.0,90743.0


In [50]:
summary_df[['Mini_Exam1','Final']]

Unnamed: 0,Mini_Exam1,Final
count,7.0,7.0
mean,19.785714,32.214286
std,1.074598,3.828154
min,19.0,24.0
25%,19.0,32.5
50%,19.5,33.0
75%,20.0,33.75
max,22.0,36.0


In [51]:
count=df_grades.Grade.value_counts()
count
#value_counts built in functions get the number of times of each unique value apperas ins column and return a series

Grade
A     5
A-    1
B     1
Name: count, dtype: int64

In [52]:
count['A']

np.int64(5)

In [54]:
df_grades['Grade'].unique()

array(['A', 'A-', 'B'], dtype=object)

Creating  new Coloumns

In [92]:
df_grades['New Column']=1
df_grades.head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID,New Column
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,90743,1
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A,7284,1
2,Susan,30.0,1,19.0,19,1,10.5,33.0,A-,7625,1
3,Sol,31.0,1,22.0,13,1,13.0,34.0,A,1237,1
4,Chris,30.0,1,19.0,17,1,12.5,33.5,A,62,1


We can create a coloumn function of another coloumn

In [93]:
df_grades['Final Precentage']=(df_grades['Final']/36)*100
df_grades.head()
#we assume final mark from 36 and made a final percentage coloumn

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID,New Column,Final Precentage
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,90743,1,91.666667
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A,7284,1,88.888889
2,Susan,30.0,1,19.0,19,1,10.5,33.0,A-,7625,1,91.666667
3,Sol,31.0,1,22.0,13,1,13.0,34.0,A,1237,1,94.444444
4,Chris,30.0,1,19.0,17,1,12.5,33.5,A,62,1,93.055556


delete singl coloumn


In [94]:
del df_grades['Final Precentage']
df_grades.head()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID,New Column
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,90743,1
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A,7284,1
2,Susan,30.0,1,19.0,19,1,10.5,33.0,A-,7625,1
3,Sol,31.0,1,22.0,13,1,13.0,34.0,A,1237,1
4,Chris,30.0,1,19.0,17,1,12.5,33.5,A,62,1


delete multiple coloumns

In [95]:
df_grades.drop(['New Column','Participation1'], axis=1, inplace=True)
df_grades.head()

Unnamed: 0,Name,Previous_Part,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID
0,Jake,32.0,19.5,20,1,10.0,33.0,A,90743
1,Joe,32.0,20.0,16,1,14.0,32.0,A,7284
2,Susan,30.0,19.0,19,1,10.5,33.0,A-,7625
3,Sol,31.0,22.0,13,1,13.0,34.0,A,1237
4,Chris,30.0,19.0,17,1,12.5,33.5,A,62


Missing Data

In [102]:
df_missing=pd.read_csv('Pandas\Pandas\Pandas lec\Data\Missing_Data.csv')
df_missing

  df_missing=pd.read_csv('Pandas\Pandas\Pandas lec\Data\Missing_Data.csv')


Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,-1
1,Joe,,1,20.0,16,1,14.0,32.0,A,23
2,Sol,31.0,1,22.0,13,1,13.0,34.0,A,34
3,Chris,30.0,-1,19.0,not available,1,12.5,33.5,A,72


In [103]:
df_missing.dtypes

Name               object
Previous_Part     float64
Participation1      int64
Mini_Exam1        float64
Mini_Exam2         object
Participation2      int64
Mini_Exam3        float64
Final             float64
Grade              object
Temp                int64
dtype: object

In [106]:
df_missing=pd.read_csv('Pandas\Pandas\Pandas lec\Data\Missing_Data.csv', na_values=['NaN','not available'])
df_missing

  df_missing=pd.read_csv('Pandas\Pandas\Pandas lec\Data\Missing_Data.csv', na_values=['NaN','not available'])


Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,,1,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,-1,19.0,,1,12.5,33.5,A,72


In [109]:
df_missing=pd.read_csv('Pandas\Pandas\Pandas lec\Data\Missing_Data.csv', na_values={"Mini_Exam2":"not available","Participation1":-1})
df_missing

  df_missing=pd.read_csv('Pandas\Pandas\Pandas lec\Data\Missing_Data.csv', na_values={"Mini_Exam2":"not available","Participation1":-1})


Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,,19.0,,1,12.5,33.5,A,72


In [110]:
df_missing.isnull()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,True,False,False,False,False,False


In [111]:
df_missing.isnull().sum()

Name              0
Previous_Part     1
Participation1    1
Mini_Exam1        0
Mini_Exam2        1
Participation2    0
Mini_Exam3        0
Final             0
Grade             0
Temp              0
dtype: int64

In [112]:
df_missing.dropna(axis=0, inplace=False)

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34


In [113]:
df_missing.fillna(0, inplace=False)

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,0.0,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,0.0,19.0,0.0,1,12.5,33.5,A,72


In [115]:
mean_temp=df_missing.Temp.mean()
df_missing.fillna({'Temp':mean_temp},inplace=False)
df_missing

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,,19.0,,1,12.5,33.5,A,72
