In [1]:
import pandas as pd
import numpy as np

### Data Cleaning

In [2]:
val1=np.array([4,np.nan,9,13])     ## create an array with NaN value as written

In [3]:
## We cannot add, subtract or multiply any number with. The result will be NaN

In [4]:
4+np.nan    

nan

In [5]:
8-np.nan

nan

In [6]:
12*np.nan

nan

In [7]:
val1

array([ 4., nan,  9., 13.])

In [8]:
np.nansum(val1)

26.0

In [9]:
## This adds all the values in an array ignoring NaN values

### NaN and None in Pandas

In [10]:
pd.Series([None,np.nan,56,1122])

0       NaN
1       NaN
2      56.0
3    1122.0
dtype: float64

### Operating on Null Values

In [11]:
## Create a Data Frame with Misssing values

In [12]:
raw_data = {'Student_id':[1, 2, 3, 4, np.nan, 6],'Student_name':['Nagesh','Anil','Saurabh','Anish',np.nan,np.nan],
            'Marks':[45, 56, np.nan, 89, 91, np.nan], 'Class':['X','VIII','IX',np.nan,np.nan,'IX'],
            'Division':['A','C',np.nan,'D','B',np.nan]}
df=pd.DataFrame(raw_data,columns=['Student_id','Student_name','Class','Division','Marks'])

In [13]:
df

Unnamed: 0,Student_id,Student_name,Class,Division,Marks
0,1.0,Nagesh,X,A,45.0
1,2.0,Anil,VIII,C,56.0
2,3.0,Saurabh,IX,,
3,4.0,Anish,,D,89.0
4,,,,B,91.0
5,6.0,,IX,,


### Dropping Null Values

In [14]:
df_no_missing=df.dropna() #Here we are dropping all values which have nan values
df_no_missing

Unnamed: 0,Student_id,Student_name,Class,Division,Marks
0,1.0,Nagesh,X,A,45.0
1,2.0,Anil,VIII,C,56.0


In [15]:
## As you can see it dropped all the rows which had NaN values

In [16]:
df_cleaned=df.dropna(how='all') #It means if there is any row which have all nan values this formula will drop the rows
df_cleaned

Unnamed: 0,Student_id,Student_name,Class,Division,Marks
0,1.0,Nagesh,X,A,45.0
1,2.0,Anil,VIII,C,56.0
2,3.0,Saurabh,IX,,
3,4.0,Anish,,D,89.0
4,,,,B,91.0
5,6.0,,IX,,


In [17]:
df['DOB'] = np.nan       ## Creates new column DOB with NaN values
df                 

Unnamed: 0,Student_id,Student_name,Class,Division,Marks,DOB
0,1.0,Nagesh,X,A,45.0,
1,2.0,Anil,VIII,C,56.0,
2,3.0,Saurabh,IX,,,
3,4.0,Anish,,D,89.0,
4,,,,B,91.0,
5,6.0,,IX,,,


In [18]:
df.dropna(axis=1,how='all')       ## Drops the columns which have all Null values

Unnamed: 0,Student_id,Student_name,Class,Division,Marks
0,1.0,Nagesh,X,A,45.0
1,2.0,Anil,VIII,C,56.0
2,3.0,Saurabh,IX,,
3,4.0,Anish,,D,89.0
4,,,,B,91.0
5,6.0,,IX,,


In [19]:
df

Unnamed: 0,Student_id,Student_name,Class,Division,Marks,DOB
0,1.0,Nagesh,X,A,45.0,
1,2.0,Anil,VIII,C,56.0,
2,3.0,Saurabh,IX,,,
3,4.0,Anish,,D,89.0,
4,,,,B,91.0,
5,6.0,,IX,,,


In [20]:
df.fillna(0)   ## Fills all null values with 0

Unnamed: 0,Student_id,Student_name,Class,Division,Marks,DOB
0,1.0,Nagesh,X,A,45.0,0.0
1,2.0,Anil,VIII,C,56.0,0.0
2,3.0,Saurabh,IX,0,0.0,0.0
3,4.0,Anish,0,D,89.0,0.0
4,0.0,0,0,B,91.0,0.0
5,6.0,0,IX,0,0.0,0.0


In [21]:
df['Marks'].fillna(df['Marks'].mean(),inplace=True)     ## Fills NaN values in Marks with the mean value
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Marks'].fillna(df['Marks'].mean(),inplace=True)     ## Fills NaN values in Marks with the mean value


Unnamed: 0,Student_id,Student_name,Class,Division,Marks,DOB
0,1.0,Nagesh,X,A,45.0,
1,2.0,Anil,VIII,C,56.0,
2,3.0,Saurabh,IX,,70.25,
3,4.0,Anish,,D,89.0,
4,,,,B,91.0,
5,6.0,,IX,,70.25,


In [22]:
df_cleaned

Unnamed: 0,Student_id,Student_name,Class,Division,Marks
0,1.0,Nagesh,X,A,45.0
1,2.0,Anil,VIII,C,56.0
2,3.0,Saurabh,IX,,
3,4.0,Anish,,D,89.0
4,,,,B,91.0
5,6.0,,IX,,


In [23]:
## Back-Fill method where it fills the Values of a row towards its previous row

In [24]:
df_cleaned.fillna(method='bfill') 

  df_cleaned.fillna(method='bfill')


Unnamed: 0,Student_id,Student_name,Class,Division,Marks
0,1.0,Nagesh,X,A,45.0
1,2.0,Anil,VIII,C,56.0
2,3.0,Saurabh,IX,D,89.0
3,4.0,Anish,IX,D,89.0
4,6.0,,IX,B,91.0
5,6.0,,IX,,


In [25]:
## Forward Fill. In this the the rows fill in null values of the next row as it is

In [26]:
df.fillna(method='ffill',inplace=True)
df

  df.fillna(method='ffill',inplace=True)


Unnamed: 0,Student_id,Student_name,Class,Division,Marks,DOB
0,1.0,Nagesh,X,A,45.0,
1,2.0,Anil,VIII,C,56.0,
2,3.0,Saurabh,IX,C,70.25,
3,4.0,Anish,IX,D,89.0,
4,4.0,Anish,IX,B,91.0,
5,6.0,Anish,IX,B,70.25,


## Hierarchical Indexing

In [27]:
import pandas as pd

In [28]:
raw_data = {'Student_id':[1, 2, 3, 4, 5, 6],'Student_name':['Nagesh','Anil','Saurabh','Anish','Ketki','Vidisha'],
            'Marks':[45, 56, 64, 89, 91, 77], 'Class':['X','VIII','IX','VII','XI','IX'],
            'Division':['A','C','F','D','B','N']}
df=pd.DataFrame(raw_data,columns=['Student_id','Student_name','Class','Division','Marks'])

In [29]:
df

Unnamed: 0,Student_id,Student_name,Class,Division,Marks
0,1,Nagesh,X,A,45
1,2,Anil,VIII,C,56
2,3,Saurabh,IX,F,64
3,4,Anish,VII,D,89
4,5,Ketki,XI,B,91
5,6,Vidisha,IX,N,77


In [30]:
df.shape

(6, 5)

In [31]:
df.set_index(['Student_id','Student_name'],drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Student_id,Student_name,Class,Division,Marks
Student_id,Student_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Nagesh,1,Nagesh,X,A,45
2,Anil,2,Anil,VIII,C,56
3,Saurabh,3,Saurabh,IX,F,64
4,Anish,4,Anish,VII,D,89
5,Ketki,5,Ketki,XI,B,91
6,Vidisha,6,Vidisha,IX,N,77


In [32]:
## In the previous code we did not allow to drop the tables

In [33]:
df.set_index(['Student_id','Student_name'],drop=True,inplace=True)

In [34]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Class,Division,Marks
Student_id,Student_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Nagesh,X,A,45
2,Anil,VIII,C,56
3,Saurabh,IX,F,64
4,Anish,VII,D,89
5,Ketki,XI,B,91
6,Vidisha,IX,N,77


## Data Manipulation

In [35]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [36]:
EDA = pd.read_csv("https://raw.githubusercontent.com/Arkul2023/Meta-Scifor-Technologies/refs/heads/main/Datasets/eda_data.csv")

In [37]:
EDA.head()

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,age,python_yn,R_yn,spark,aws,excel,job_simp,seniority,desc_len,num_comp
0,0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,...,47,1,0,0,0,1,data scientist,na,2536,0
1,1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,...,36,1,0,0,0,0,data scientist,na,4783,0
2,2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,...,10,1,0,1,0,1,data scientist,na,3461,0
3,3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,...,55,1,0,0,0,0,data scientist,na,3883,3
4,4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,...,22,1,0,0,0,1,data scientist,na,2728,3


In [38]:
## Checking the unique values in a dataset

In [39]:
EDA.columns

Index(['Unnamed: 0', 'Job Title', 'Salary Estimate', 'Job Description',
       'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary',
       'company_txt', 'job_state', 'same_state', 'age', 'python_yn', 'R_yn',
       'spark', 'aws', 'excel', 'job_simp', 'seniority', 'desc_len',
       'num_comp'],
      dtype='object')

In [40]:
EDA['excel'].unique()

array([1, 0], dtype=int64)

In [41]:
new_excel=pd.Categorical(EDA['excel'])
new_excel=new_excel.rename_categories(['Ones','Zeros'])
new_excel.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Ones,354,0.477089
Zeros,388,0.522911


In [42]:
EDA['num_comp'].unique()

array([0, 3, 2, 1, 4], dtype=int64)

In [43]:
num_comp=pd.Categorical(EDA['num_comp'],ordered=True)
num_comp=num_comp.rename_categories(['class0','class1','class2','class3','class4'])
num_comp.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
class0,460,0.619946
class1,12,0.016173
class2,41,0.055256
class3,228,0.307278
class4,1,0.001348


In [44]:
## Reassigning new name to num_comp

In [45]:
EDA['num_comp'] = num_comp

In [46]:
EDA

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,age,python_yn,R_yn,spark,aws,excel,job_simp,seniority,desc_len,num_comp
0,0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,...,47,1,0,0,0,1,data scientist,na,2536,class0
1,1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,...,36,1,0,0,0,0,data scientist,na,4783,class0
2,2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,...,10,1,0,1,0,1,data scientist,na,3461,class0
3,3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,...,55,1,0,0,0,0,data scientist,na,3883,class3
4,4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,...,22,1,0,0,0,1,data scientist,na,2728,class3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,737,"Sr Scientist, Immuno-Oncology - Oncology",$58K-$111K (Glassdoor est.),Site Name: USA - Massachusetts - Cambridge\nPo...,3.9,GSK\n3.9,"Cambridge, MA","Brentford, United Kingdom",10000+ employees,1830,...,190,0,0,0,1,0,na,senior,6162,class3
738,738,Senior Data Engineer,$72K-$133K (Glassdoor est.),THE CHALLENGE\nEventbrite has a world-class da...,4.4,Eventbrite\n4.4,"Nashville, TN","San Francisco, CA",1001 to 5000 employees,2006,...,14,1,0,1,1,0,data engineer,senior,6130,class3
739,739,"Project Scientist - Auton Lab, Robotics Institute",$56K-$91K (Glassdoor est.),The Auton Lab at Carnegie Mellon University is...,2.6,Software Engineering Institute\n2.6,"Pittsburgh, PA","Pittsburgh, PA",501 to 1000 employees,1984,...,36,0,0,0,0,1,na,na,3078,class0
740,740,Data Science Manager,$95K-$160K (Glassdoor est.),Data Science ManagerResponsibilities:\n\nOvers...,3.2,"Numeric, LLC\n3.2","Allentown, PA","Chadds Ford, PA",1 to 50 employees,-1,...,-1,0,0,0,0,1,manager,na,1642,class0


In [47]:
## As you can see the name in num_comp column has been changed 

In [48]:
EDA.isna().sum()

Unnamed: 0           0
Job Title            0
Salary Estimate      0
Job Description      0
Rating               0
Company Name         0
Location             0
Headquarters         0
Size                 0
Founded              0
Type of ownership    0
Industry             0
Sector               0
Revenue              0
Competitors          0
hourly               0
employer_provided    0
min_salary           0
max_salary           0
avg_salary           0
company_txt          0
job_state            0
same_state           0
age                  0
python_yn            0
R_yn                 0
spark                0
aws                  0
excel                0
job_simp             0
seniority            0
desc_len             0
num_comp             0
dtype: int64

## Series Manipulation

In [49]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore') 

In [56]:
ser1=pd.Series(data=[1, 2, 3, 4],index=['AB','BC','CD','DA'])
ser1

AB    1
BC    2
CD    3
DA    4
dtype: int64

In [60]:
## Prints data at particular index in below code

In [58]:
print(ser1[0])
print(ser1[2])

1
3


In [61]:
## Prints index by name of item/object

In [59]:
print(ser1["CD"])
print(ser1['DA'])

3
4


In [62]:
ser1

AB    1
BC    2
CD    3
DA    4
dtype: int64

In [68]:
print(ser1[:-1])

AB    1
BC    2
CD    3
dtype: int64
