In [2]:
import pandas as pd
import os

# Problem statement:

### There is a folder called Mandir Marg in this directory. For your convenience, here is a Google Drive link to download it: http://bit.ly/2PY2sox

### Your task is to combine all the data from this folder into a single dataframe. Each CSV file has a single column called 'value'. Make a new dataframe which holds these 'value' columns as separate columns. Each column should be named after the unique entry it stores, like 'BEN', 'CO', 'NOX', etc.

# Solution

In [4]:
# Enter the data folder so the later cells can refer to the files by bare name.
# The guard makes this cell safe to re-run: a second unconditional chdir would
# look for 'Mandir Marg' inside 'Mandir Marg' and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'Mandir Marg':
    os.chdir('Mandir Marg')

In [6]:
# List the CSV files of the current folder.
# os.listdir() returns entries in arbitrary order and also reports non-data
# entries (e.g. notebook checkpoint folders), so keep only '.csv' names and
# sort them case-insensitively to get a reproducible column order.
lis = sorted(
    (name for name in os.listdir() if name.lower().endswith('.csv')),
    key=str.lower,
)
lis

['Mandir Marg_BEN.csv',
 'Mandir Marg_CO.csv',
 'Mandir Marg_NH3.csv',
 'Mandir Marg_NO1.csv',
 'Mandir Marg_NO2.csv',
 'Mandir Marg_NOX.csv',
 'Mandir Marg_O3.csv',
 'Mandir Marg_pXy.csv',
 'Mandir Marg_SO2.csv',
 'Mandir Marg_Tol.csv']

In [9]:
# str.index gives the position of the first '_' in the first file name
# (it raises ValueError if the name contains no '_')
lis[0].index('_')

11

In [10]:
# so if we want to extract 'BEN' from the first filename:
# start one character past the '_' (computed rather than hard-coded, so a
# folder prefix of a different length still works) and drop the 4-character
# '.csv' suffix
lis[0][lis[0].index('_') + 1:-4]

'BEN'

In [7]:
# walk the directory listing, printing one entry per line
for entry in lis:
    print(entry)

Mandir Marg_BEN.csv
Mandir Marg_CO.csv
Mandir Marg_NH3.csv
Mandir Marg_NO1.csv
Mandir Marg_NO2.csv
Mandir Marg_NOX.csv
Mandir Marg_O3.csv
Mandir Marg_pXy.csv
Mandir Marg_SO2.csv
Mandir Marg_Tol.csv


In [11]:
# now let's try to extract the column names
# For each file name: strip the extension, then keep what follows the first
# '_'. Unlike a fixed [12:-4] slice, this does not depend on the length of the
# folder prefix or of the extension.
cols = [os.path.splitext(filename)[0].split('_', 1)[-1] for filename in lis]

cols

['BEN', 'CO', 'NH3', 'NO1', 'NO2', 'NOX', 'O3', 'pXy', 'SO2', 'Tol']

In [12]:
# zip pairs up the items of several lists position by position, so a single
# loop can walk the column labels and the file names together

# quick check of the pairing
for label, csv_file in zip(cols, lis):
    print(label, csv_file)

BEN Mandir Marg_BEN.csv
CO Mandir Marg_CO.csv
NH3 Mandir Marg_NH3.csv
NO1 Mandir Marg_NO1.csv
NO2 Mandir Marg_NO2.csv
NOX Mandir Marg_NOX.csv
O3 Mandir Marg_O3.csv
pXy Mandir Marg_pXy.csv
SO2 Mandir Marg_SO2.csv
Tol Mandir Marg_Tol.csv


In [13]:
# Alright, we are ready now. Let's write some code

# Read the 'value' column of every file, keyed by its column label.
value_columns = {
    colname: pd.read_csv(csv_name)['value']
    for colname, csv_name in zip(cols, lis)
}

# Assemble all columns in one step. Assigning them one at a time into an empty
# frame aligns every later column to the index of the first file, silently
# dropping the extra rows of any longer file; concat(axis=1) keeps the union
# of the row labels and fills the gaps with NaN instead.
if value_columns:
    answer = pd.concat(value_columns, axis=1)
else:
    # pd.concat raises on empty input; keep the original empty-frame result.
    answer = pd.DataFrame(columns=cols)

# Let's see the final answer (first rows only, the frame is large):
answer.head(10)

Unnamed: 0,BEN,CO,NH3,NO1,NO2,NOX,O3,pXy,SO2,Tol
0,4.8,2.1,31.3,39.0,41.3,93.0,3.9,5.7,2.0,13.8
1,4.6,2.1,31.9,34.2,47.0,91.3,4.5,5.2,1.4,10.5
2,4.7,2.1,31.9,39.7,47.9,100.0,3.6,5.2,0.6,10.2
3,4.6,1.4,29.9,22.2,37.2,65.6,0.9,5.1,4.0,9.7
4,4.6,1.4,27.4,26.7,33.5,68.4,1.3,4.7,3.4,9.5
5,4.3,3.3,34.0,18.8,45.0,67.9,1.9,4.2,4.6,8.5
6,4.6,2.3,43.7,21.7,59.4,85.2,5.8,3.5,9.7,7.3
7,5.1,2.9,44.6,31.5,66.5,105.7,7.9,3.6,5.1,7.1
8,4.9,2.4,44.5,22.5,63.5,90.3,6.0,4.0,7.4,7.7
9,5.1,2.5,51.3,24.9,65.5,95.4,6.0,4.1,9.7,7.7


In [14]:
# summary of the combined frame: index range, per-column non-null counts,
# dtypes and memory usage
answer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26608 entries, 0 to 26607
Data columns (total 10 columns):
BEN    26608 non-null object
CO     22585 non-null float64
NH3    25774 non-null float64
NO1    25792 non-null float64
NO2    26163 non-null float64
NOX    26188 non-null float64
O3     24887 non-null float64
pXy    26608 non-null float64
SO2    22456 non-null float64
Tol    26608 non-null float64
dtypes: float64(9), object(1)
memory usage: 2.0+ MB
