Reading a csv file from the root folder

In [23]:
import pandas as pd

# Load the CSV that sits next to this notebook into a DataFrame.
df = pd.read_csv(
    'aug_train.csv',
)
# Show the first rows as a quick sanity check of the load.
df.head()


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


Reading a csv file from a server/url

In [21]:
import requests
from io import StringIO

url = 'https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv'

# A timeout stops the cell from hanging indefinitely if the server is unreachable.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as CSV.
response.raise_for_status()

# read_csv expects a path or a file-like object, so wrap the downloaded text
# in an in-memory text buffer.
data = StringIO(response.text)
pd.read_csv(data)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA
...,...,...
189,Paraguay,SOUTH AMERICA
190,Peru,SOUTH AMERICA
191,Suriname,SOUTH AMERICA
192,Uruguay,SOUTH AMERICA


Loading a TSV (tab-separated values) file. By default, read_csv() splits fields on ',' (you can see this default by pressing Shift + Tab on read_csv to view its signature), so we need to pass sep='\t' to read a TSV file correctly.

In [37]:
# Fields in this file are separated by tabs, so override the default ',' delimiter.
df = pd.read_csv(
    '../100daysofML/movie_titles_metadata.tsv',
    sep='\t',
)
df.head()

Unnamed: 0,m0,10 things i hate about you,1999,6.90,62847,['comedy' 'romance']
0,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
1,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
2,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
3,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']
4,m5,the fifth element,1997,7.5,133756.0,['action' 'adventure' 'romance' 'sci-fi' 'thri...


### If we take a closer look at the dataset, there is a problem.
The file has no header row, so the first data row is automatically used as the column names. We need to supply proper column names ourselves.

In [45]:
# Labels to use for the six columns, in file order, so that the first
# data row is no longer consumed as the header.
column_names = ["Sno.", "Name", "Release Year", "Ratings", "Votes", "Genres"]
df = pd.read_csv(
    '../100daysofML/movie_titles_metadata.tsv',
    sep='\t',
    names=column_names,
)
df.head()

Unnamed: 0,Sno.,Name,Release Year,Ratings,Votes,Genres
0,m0,10 things i hate about you,1999,6.9,62847.0,['comedy' 'romance']
1,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
2,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
3,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
4,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']


# Header Params

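The header param tells read_csv which row of the file (0-indexed) holds the column names. Use it when the real header is not on the first line of the file and would otherwise be loaded as a data row.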

In [56]:
# Take the column names from the file's second line (row index 1).
df = pd.read_csv(
    'test.csv',
    header=1,
)
df.head()

Unnamed: 0,0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
1,2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0
2,3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1
3,4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0


# usecols params

Takes a list of column names and loads only those columns; all other columns are skipped. Useful when you only want to work with a few specific columns.

In [60]:
# Only these columns are loaded; every other column in the file is skipped.
wanted_columns = ["enrollee_id", "gender", "education_level"]
df = pd.read_csv(
    'test.csv',
    header=1,
    usecols=wanted_columns,
)
df.head()

Unnamed: 0,enrollee_id,gender,education_level
0,29725,Male,Graduate
1,11561,,Graduate
2,33241,,Graduate
3,666,Male,Masters


# nrows

Loads only the given number of rows. Handy when working with large datasets and you only need a few hundred or a few thousand rows.

In [110]:
import requests
from io import StringIO

url = 'https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv'

# A timeout stops the cell from hanging indefinitely if the server is unreachable.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as CSV.
response.raise_for_status()

data = StringIO(response.text)
# nrows caps how many data rows are parsed from the buffer.
pd.read_csv(data, nrows=99)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA
...,...,...
94,United Arab Emirates,ASIA
95,Uzbekistan,ASIA
96,Vietnam,ASIA
97,Yemen,ASIA


# Encoding Parameter

First find the file's encoding (from a text editor, or programmatically with a library such as charset_normalizer, as below) and pass that value to the encoding param.

In [142]:
from charset_normalizer import from_path

# Let charset_normalizer analyse the file and keep its top-ranked guess.
matches = from_path('zomato.csv')
result = matches.best()
# Name of the detected encoding, to be passed to read_csv(encoding=...).
result.encoding

'cp775'

In [143]:
# Decode the file with the encoding reported by the detection step.
df = pd.read_csv(
    'zomato.csv',
    encoding='cp775',
)
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


### on_bad_lines='skip' (error_bad_lines=False before pandas 1.3; error_bad_lines was removed in pandas 2.0): skips the rows that contain an incorrect no. of columns or that cause inconsistencies in your dataset.
For example, if a few rows in your dataset have 9 columns instead of 8, those rows are omitted.

### dtype: overrides the inferred data types of columns with the types you specify. Takes a dictionary mapping column names to types

In [160]:
# Load two columns and force their dtypes at read time instead of
# relying on pandas' type inference.
pd.read_csv(
    'aug_train.csv',
    usecols=["training_hours", "target"],
    dtype={'training_hours': float, 'target': int},
)

Unnamed: 0,training_hours,target
0,36.0,1
1,47.0,0
2,83.0,0
3,52.0,1
4,8.0,0
...,...,...
19153,42.0,1
19154,52.0,1
19155,44.0,0
19156,97.0,0


### Parsing Dates
Date columns are very important in pandas, but they are initially loaded as object dtype. In order to filter on them, change their format, or handle incorrect or NaN dates, we need to convert them to datetime format. We can do so by two methods.
1. Directly using the parse_dates=["date"] parameter while loading the dataset. This requires you to know the name of the date column in advance.
2. Using pandas' pd.to_datetime() function, which additionally lets you choose the format and handle errors, e.g. format="%m-%d-%y", errors='coerce'.

In [187]:
# Ask read_csv to convert the 'date' column to datetimes while loading.
df = pd.read_csv(
    'IPL Matches 2008-2020.csv',
    parse_dates=['date'],
)
# The dtype listing confirms whether 'date' was parsed as a datetime.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id               816 non-null    int64         
 1   city             803 non-null    object        
 2   date             816 non-null    datetime64[ns]
 3   player_of_match  812 non-null    object        
 4   venue            816 non-null    object        
 5   neutral_venue    816 non-null    int64         
 6   team1            816 non-null    object        
 7   team2            816 non-null    object        
 8   toss_winner      816 non-null    object        
 9   toss_decision    816 non-null    object        
 10  winner           812 non-null    object        
 11  result           812 non-null    object        
 12  result_margin    799 non-null    float64       
 13  eliminator       812 non-null    object        
 14  method           19 non-null     object   

In [167]:
# Reload the same file with no date handling, so 'date' is read as-is.
df = pd.read_csv(
    'IPL Matches 2008-2020.csv',
)
df.head()

Unnamed: 0,id,city,date,player_of_match,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,method,umpire1,umpire2
0,335982,Bangalore,2008-04-18,BB McCullum,M Chinnaswamy Stadium,0,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N,,Asad Rauf,RE Koertzen
1,335983,Chandigarh,2008-04-19,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",0,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,N,,MR Benson,SL Shastri
2,335984,Delhi,2008-04-19,MF Maharoof,Feroz Shah Kotla,0,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,N,,Aleem Dar,GA Pratapkumar
3,335985,Mumbai,2008-04-20,MV Boucher,Wankhede Stadium,0,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,N,,SJ Davis,DJ Harper
4,335986,Kolkata,2008-04-20,DJ Hussey,Eden Gardens,0,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,N,,BF Bowden,K Hariharan


In [183]:
# Convert the 'date' column to datetimes using an explicit YYYY-MM-DD format;
# errors='coerce' is passed so unparseable values do not raise.
df['parsed_dates'] = pd.to_datetime(
    df['date'],
    format="%Y-%m-%d",
    errors='coerce',
)
# Day of the month on which each match was played.
df['parsed_dates'].dt.day

0      18
1      19
2      19
3      20
4      20
       ..
811    28
812     5
813     6
814     8
815    10
Name: parsed_dates, Length: 816, dtype: int32

### na_values=: useful when missing values are not stored as NaN but are marked with placeholders such as '-' or '%' or '/' in the dataset

In [192]:
from charset_normalizer import from_path

# Detect the file's encoding before loading it: keep the best-ranked candidate.
candidates = from_path('zomato.csv')
result = candidates.best()
result.encoding

'cp775'

In [202]:
# Treat the value 162 in the 'Country Code' column as missing: those cells are
# loaded as NaN, which makes it easy to drop or fill them later.
pd.read_csv(
    'zomato.csv',
    encoding='cp775',
    na_values={'Country Code': 162},
)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.584450,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9546,5915730,Naml█▒ Gurme,208.0,█┴stanbul,"Kemankeü¶ Karamustafa Paü¶a Mahallesi, R█▒ht█▒...",KarakĒ_y,"KarakĒ_y, █┴stanbul",28.977392,41.022793,Turkish,...,Turkish Lira(TL),No,No,No,No,3,4.1,Green,Very Good,788
9547,5908749,Ceviz A█¶ac█▒,208.0,█┴stanbul,"Koü¶uyolu Mahallesi, Muhittin ĒņstĒ_nda█¶ Cadd...",Koü¶uyolu,"Koü¶uyolu, █┴stanbul",29.041297,41.009847,"World Cuisine, Patisserie, Cafe",...,Turkish Lira(TL),No,No,No,No,3,4.2,Green,Very Good,1034
9548,5915807,Huqqa,208.0,█┴stanbul,"KuruĒ_eü¶me Mahallesi, Muallim Naci Caddesi, N...",KuruĒ_eü¶me,"KuruĒ_eü¶me, █┴stanbul",29.034640,41.055817,"Italian, World Cuisine",...,Turkish Lira(TL),No,No,No,No,4,3.7,Yellow,Good,661
9549,5916112,Aü¶ü¶k Kahve,208.0,█┴stanbul,"KuruĒ_eü¶me Mahallesi, Muallim Naci Caddesi, N...",KuruĒ_eü¶me,"KuruĒ_eü¶me, █┴stanbul",29.036019,41.057979,Restaurant Cafe,...,Turkish Lira(TL),No,No,No,No,4,4.0,Green,Very Good,901
