# Introduction to Pandas (Series, DataFrame basics)


In [128]:
import pandas as pd
import numpy as np
import random 

1. **Create a Pandas Series from a Python list, numpy array, and a dictionary.**

In [8]:
# 1. Making Pandas Series from Python List

print(f"\n Pandas Series from Python List")
print("="*20,"\n")

pythonList = [1,2,3,4,5,6,7,8,9,10]
ser1 = pd.Series(pythonList)
print(ser1)

# 2. Making Pandas Series from Numpy Array

print(f"\n Pandas Series from Numpy Array")
print("="*20,"\n")

numpyArray = np.arange(10)
ser2 = pd.Series(numpyArray)
print(ser2)

# 3. Making Pandas Series from Dictionary

print(f"\n Pandas Series from Dictionary")
print("="*20,"\n")
dictionary = {"red":1,"green":2,"blue":3,}
ser3 = pd.Series(dictionary)
print(ser3)
# print(ser3.iloc[0]) # Another way to access dictionary element using index



 Pandas Series from Python List

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

 Pandas Series from Numpy Array

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

 Pandas Series from Dictionary

red      1
green    2
blue     3
dtype: int64
1


2. **Assign a custom index to the Series.**

In [20]:
randomList = [1,2,3,4,5]

labelledSeries = pd.Series(randomList , index = ['a','b','c','d','e'])
#NOTE : The lenght of index should match the lenght of elements in Series
print(labelledSeries) #printing whole sereis
print(f"\nThe Value At Index C :{labelledSeries["c"]}") #printing using custom label

a    1
b    2
c    3
d    4
e    5
dtype: int64

The Value At Index C :3


3. **Perform basic arithmetic operations on Series.**

In [28]:
series1 = pd.Series([1,2,3,4,5])
series2 = pd.Series ([6,7,8,9,10])

# Adding 2 Series
print(f"\nSum Operation on 2 series\n")
add_series = series1+series2
print(add_series)

# Subtracting 2 Series
print(f"\nSubtraction Operation on 2 series\n")
sub_series = series1-series2
print(sub_series)

# Multiplying 2 Series
print(f"\nMultiplication Operation on 2 series\n")
mul_series = series1*series2
print(mul_series)

# Dividing 2 Series
print(f"\nDivision Operation on 2 series\n")
div_series = series1/series2
print(div_series)




Sum Operation on 2 series

0     7
1     9
2    11
3    13
4    15
dtype: int64

Subtraction Operation on 2 series

0   -5
1   -5
2   -5
3   -5
4   -5
dtype: int64

Multiplication Operation on 2 series

0     6
1    14
2    24
3    36
4    50
dtype: int64

Division Operation on 2 series

0    0.166667
1    0.285714
2    0.375000
3    0.444444
4    0.500000
dtype: float64


4. ***Access elements using index labels and positions.***

In [40]:
array = np.random.randint(1,10,10)
series1 = pd.Series(array, index = ["a","b","c","d","e","f","g","h","i","j"])
print(series1)

# Using .loc[] function to access element using label
print(f"\nPrint By First Method:{series1.loc["c"]}") # First Method
print(f"\nPrint By Second Method: {series1["c"]}\n") #2nd Method

#Using .iloc function to access element using index
print(f"\nPrint By First Method: {series1.iloc[0]}") #First Method (Recommended)
print(f"\nPrint By First Method: {series1[0]}") #Second Method (Going to depreciated)

a    2
b    7
c    2
d    7
e    8
f    9
g    9
h    2
i    4
j    1
dtype: int32

Print By First Method:2

Print By Second Method: 2


Print By First Method: 2

Print By First Method: 2


  print(f"\nPrint By First Method: {series1[0]}") #Second Method (Going to depreciated)


5. ***Filter the Series to include only values greater than a specific threshold.***


In [44]:
print(f"\nOrigrnal Series\n")
array = np.random.randint(5,15,20)
series = pd.Series(array)
print(series)
print(f"\nFiltered Series (Number >= 10)\n")

filteredSeries = series[series >= 10]
print(filteredSeries)


Origrnal Series

0      9
1      8
2     11
3      9
4     14
5      5
6     12
7      7
8     10
9      9
10     5
11    12
12     8
13     8
14    13
15     7
16     8
17     8
18     9
19     6
dtype: int32

Filtered Series (Number >= 10)

2     11
4     14
6     12
8     10
11    12
14    13
dtype: int32


6. ***Create a DataFrame from a dictionary of lists.***

In [55]:
#Dictionary Of Lists
listDictionary = {"Names":["Ahsan","Umer","Anas"],
                 "Cars":["Lamborghini","Bugatti","Rolls Royce"]
                 }
print(f"\n Method 1\n")
dataframe = pd.DataFrame.from_dict(listDictionary) #Method 1
print(dataframe)

print(f"\n Method 2\n")
dataframe = pd.DataFrame.from_records(listDictionary , index = ['a','b','c']) #Method 2
print(dataframe)


 Method 1

   Names         Cars
0  Ahsan  Lamborghini
1   Umer      Bugatti
2   Anas  Rolls Royce

 Method 2

          Cars  Names
a  Lamborghini  Ahsan
b      Bugatti   Umer
c  Rolls Royce   Anas


7. ***Create a DataFrame from a numpy array, specifying column and index names.***

In [67]:
print("\nPrinting Whole Dataframe")
nparray = np.random.randint(5,10,16).reshape(4,4)
df = pd.DataFrame.from_records(nparray,index=['Phy Test','Eng Test','Math Test','Chem Test'],columns = ["Class 1 Avg.","Class2 Avg.","Class3 Avg","Class4 Avg."])
print(df) #Printing Whole Dataframe

print("\nPrinting Only Phy Test Avg of class 2")
print(df.loc['Phy Test','Class2 Avg.'])


Printing Whole Dataframe
           Class 1 Avg.  Class2 Avg.  Class3 Avg  Class4 Avg.
Phy Test              6            5           5            7
Eng Test              5            8           6            9
Math Test             8            7           7            6
Chem Test             9            7           8            9

Printing Only Phy Test Avg of class 2
5


8. ***Load a DataFrame from a CSV file.***  
I found an Student_Performance dataset from internet.It includes features like Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers 

In [70]:
data = pd.read_csv("Student_Performance.csv")
print(data)

      Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0                 7               99                        Yes            9   
1                 4               82                         No            4   
2                 8               51                        Yes            7   
3                 5               52                        Yes            5   
4                 7               75                         No            8   
...             ...              ...                        ...          ...   
9995              1               49                        Yes            4   
9996              7               64                        Yes            8   
9997              6               83                        Yes            8   
9998              9               97                        Yes            7   
9999              7               74                         No            8   

      Sample Question Papers Practiced 

9. ***Display the first and last five rows of the DataFrame.***

In [73]:
print("\n First Five Rows\n")
print(data.head())
print("\n Last Five Rows\n")
print(data.tail())


 First Five Rows

   Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0              7               99                        Yes            9   
1              4               82                         No            4   
2              8               51                        Yes            7   
3              5               52                        Yes            5   
4              7               75                         No            8   

   Sample Question Papers Practiced  Performance Index  
0                                 1               91.0  
1                                 2               65.0  
2                                 2               45.0  
3                                 2               36.0  
4                                 5               66.0  

 Last Five Rows

      Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
9995              1               49                        Yes            4  

10. **Get a summary of the DataFrame including the mean, median, and standard deviation of numeric columns.**

In [90]:
#Method 1
print("\n Using Describe Function \n")
print(data.describe()) #Note 50% in Describe Function output is also median


#Method 2
print("\nUsing Median Funciton To Find Median Seperately\n")
# Seperating Numeric Data From My Dataset So That I Can Find Median
numericData = data.select_dtypes(include = 'number')
print(numericData.median())


 Using Describe Function 

       Hours Studied  Previous Scores   Sleep Hours  \
count   10000.000000     10000.000000  10000.000000   
mean        4.992900        69.445700      6.530600   
std         2.589309        17.343152      1.695863   
min         1.000000        40.000000      4.000000   
25%         3.000000        54.000000      5.000000   
50%         5.000000        69.000000      7.000000   
75%         7.000000        85.000000      8.000000   
max         9.000000        99.000000      9.000000   

       Sample Question Papers Practiced  Performance Index  
count                      10000.000000       10000.000000  
mean                           4.583300          55.224800  
std                            2.867348          19.212558  
min                            0.000000          10.000000  
25%                            2.000000          40.000000  
50%                            5.000000          55.000000  
75%                            7.000000          

11. ***Extract a specific column as a Series.***

In [99]:
#Using Student Performance Dataset
# Making Series of Column "Hours Studied"
print("\n Orignal Column\n")
print(data["Hours Studied"])
print("\n Series\n")
series = pd.Series(data["Hours Studied"])
print(series)


 Orignal Column

0       7
1       4
2       8
3       5
4       7
       ..
9995    1
9996    7
9997    6
9998    9
9999    7
Name: Hours Studied, Length: 10000, dtype: int64

 Series

0        7.0
1        4.0
2        8.0
3        5.0
4        7.0
        ... 
9996     7.0
9997     6.0
9998     9.0
9999     7.0
10000    NaN
Name: Hours Studied, Length: 10001, dtype: float64


12. ***Filter rows based on column values.***

In [107]:
#Using Student Performance Dataset
# Filterd Rows based on columns value (Condition : Sleep Hours is equal to 5)
print(f"\n Filtered Data \n")
filteredData = data[data["Sleep Hours"]==5] #This will bring all the records in which sleep hours == 5

print(filteredData)


 Filtered Data 

      Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
3                 5               52                        Yes            5   
6                 7               73                        Yes            5   
23                1               85                         No            5   
39                9               68                         No            5   
57                2               70                        Yes            5   
...             ...              ...                        ...          ...   
9936              7               58                         No            5   
9940              8               95                         No            5   
9982              8               51                         No            5   
9985              8               99                         No            5   
9989              3               46                         No            5   

      Sample Question

13. **Select rows based on multiple conditions.**


In [122]:
#Using Student Performance Dataset
print("\nFiltered Data\n")
# This Line will filtered data on basis of sleep hours and performance index
filteredData = data[(data["Sleep Hours"]==5) & (data["Performance Index"]>80)] 
print(filteredData)


Filtered Data

      Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
72                9               82                        Yes            5   
131               9               89                        Yes            5   
135               7               96                         No            5   
256               6               93                        Yes            5   
310               8               91                         No            5   
...             ...              ...                        ...          ...   
9847              5               94                        Yes            5   
9851              5               94                         No            5   
9889              8               88                         No            5   
9940              8               95                         No            5   
9985              8               99                         No            5   

      Sample Question P

14. **Add a new column to the DataFrame.**


In [136]:
print("\n Updated Data")
# Now this line will add another column of name 'Gender'
data["Gender"] = ['male' if random.random() > 0.5 else 'female' for i in range (len(data))] # This line will randomly select male and female (probablity of selection is 50%)
print(data)


 Updated Data
      Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0                 7               99                        Yes            9   
1                 4               82                         No            4   
2                 8               51                        Yes            7   
3                 5               52                        Yes            5   
4                 7               75                         No            8   
...             ...              ...                        ...          ...   
9995              1               49                        Yes            4   
9996              7               64                        Yes            8   
9997              6               83                        Yes            8   
9998              9               97                        Yes            7   
9999              7               74                         No            8   

      Sample Question Pa

14. **Delete a column from the DataFrame.**

In [142]:

data = data.drop("Gender",axis=1)# This Line will drop the column named "Gender". Axis=1 tells that the whole column should be dropped
data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


16. **Rename columns in the DataFrame.**

In [150]:
print("\nBefore Renaming the 'Hours Studied' column\n")
print(data.head())

print("\nAfter Renaming the 'Hours Studied' column\n")
data.rename(columns = {"Hours Studied":"Studied Hours"},inplace = True)# Inplace tells the modification should done in copy of dataset(inplace = False) or orignal dataset (inplace = True)
print(data.head())


 Before Renaming the 'Hours Studied' column

   Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0              7               99                        Yes            9   
1              4               82                         No            4   
2              8               51                        Yes            7   
3              5               52                        Yes            5   
4              7               75                         No            8   

   Sample Question Papers Practiced  Performance Index  
0                                 1               91.0  
1                                 2               65.0  
2                                 2               45.0  
3                                 2               36.0  
4                                 5               66.0  

 After Renaming the 'Hours Studied' column

   Studied Hours  Previous Scores Extracurricular Activities  Sleep Hours  \
0              7           

# Student Grade Analysis

In [211]:
#randomly producing Marks Array
marksArray = np.random.randint(40,90,10)
print(marksArray)
#Creating Name Array
namesArray = ["Ahsan","Umer","Abdullah","Anas","Arqam","Ashir","Abdur Rehman","Habib","Rauhan","Hassan"]
#Making Dataframe
customData = pd.DataFrame(namesArray,columns = ["Names"])
customData["Marks"] = marksArray 
customData.head()

# Randomly producing Assignments columns
customData["Assignments"] = ["yes" if random.random()>0.5 else "no" for i in range(len(customData))]
print(customData.head())

print("\n After Giving Extra 5 Marks to Those Who Complete Assignments\n")
# Now Adding 5 Marks Extra if they done Assignments . I am using lambda funciton for this
customData["Marks"] = customData.apply(lambda record : record["Marks"]+5 if record["Assignments"]=='yes' else record['Marks'],axis=1)
print(customData.head())

print("\n Now Finding the top performers , whose marks are greater than 90\n")
filteredData = customData[customData["Marks"] > 90]
if (len(filteredData)>0):
    print(filteredData)
else:
    print("No Student in this category")


[74 56 46 50 60 78 49 44 72 89]
      Names  Marks Assignments
0     Ahsan     74          no
1      Umer     56         yes
2  Abdullah     46         yes
3      Anas     50         yes
4     Arqam     60         yes

 After Giving Extra 5 Marks to Those Who Complete Assignments

      Names  Marks Assignments
0     Ahsan     74          no
1      Umer     61         yes
2  Abdullah     51         yes
3      Anas     55         yes
4     Arqam     65         yes

 Now Finding the top performers , whose marks are greater than 90

    Names  Marks Assignments
9  Hassan     94         yes
