In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Pandas (continue)

URL https://www.freecodecamp.org/news/how-to-combine-multiple-csv-files-with-8-lines-of-code-265183e0854/

In [1]:
import glob
import numpy as np
import os
import pandas as pd
import seaborn as sns

In [7]:
df = sns.load_dataset('diamonds')
df.shape

(53940, 10)

In [8]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


# 1. Spliting a DataFrame
## 1.1 Splitting by row index
<img src="https://scikit-learn.org/stable/_images/grid_search_cross_validation.png" width="60%" height="60%" />

In [5]:
df_1 = df.iloc[:1000,:]
df_1.shape

(1000, 10)

In [6]:
df_2 = df.iloc[1001:,:]
df_2.shape

(52939, 10)

## 1.2 Splitting dataframe in a particular size (60%, random selection)

In [9]:
df_split = df.sample(frac=0.6, random_state=200)
df_split.reset_index()
df_split.shape

(32364, 10)

### Splitting dataframe by groups

In [None]:
grouped = df.groupby(df.color)
df_new = grouped.get_group("E")
df_new.shape

In [None]:
df_new.head()

# 2. Join DataFrames
<img src="https://i.stack.imgur.com/h9Pln.png">

In [None]:
df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                   'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
df

In [None]:
other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                      'B': ['B0', 'B1', 'B2']})
other

## 2.1 Left join

In [None]:
df2 = df.join(other, how='left', lsuffix='_caller', rsuffix='_other')
df2

## 2.2 Right join

In [None]:
df2 = df.join(other, how='right', lsuffix='_caller', rsuffix='_other')
df2

## 2.3 Inner join

In [None]:
df2 = df.join(other, how='inner', lsuffix='_caller', rsuffix='_other')
df2

## 2.4 Outer join

In [None]:
df2 = df.join(other, how='outer', lsuffix='_caller', rsuffix='_other')
df2

# 3. Working with more datasets
**Concat more files in the same format, then save the combined file for later use**

In [10]:
import glob
import numpy as np
import os
import pandas as pd

In [11]:
dir_in = 'data/input'
print(dir_in)
os.listdir(dir_in)

data/input


['FR040120000800100hour.1-1-1999.31-12-2012',
 'FR040370000800100hour.1-1-1999.31-12-2012',
 'BETN0290000800100hour.1-1-1990.31-12-2012',
 'BETR8010000800100hour.1-1-1990.31-12-2012']

### Select files

In [12]:
# all files with .csv 
# extension = 'csv'
# filenames = [i for i in glob.glob('*.{}'.format(extension))]

# all files
filenames = [i for i in glob.glob(dir_in + '/*')]
print(*filenames, sep='\n')

data/input/FR040120000800100hour.1-1-1999.31-12-2012
data/input/FR040370000800100hour.1-1-1999.31-12-2012
data/input/BETN0290000800100hour.1-1-1990.31-12-2012
data/input/BETR8010000800100hour.1-1-1990.31-12-2012


### Concat selected files to one combined

In [13]:
combined_file = pd.concat([pd.read_csv(f) for f in filenames ])

### Export to csv

In [14]:
dir_out = '.'
filename_out = 'combined_file.csv'
combined_file.to_csv(dir_out + '/' + filename_out, index=False, encoding='utf-8')

# 4. Linear column removing

## 4.1 Identify and remove columns that contain a single value
**The number of unique values for each column**

In [15]:
from pandas import read_csv

df = read_csv('data/oil-spill.csv', header=None)
print(df.nunique())

0     238
1     297
2     927
3     933
4     179
5     375
6     820
7     618
8     561
9      57
10    577
11     59
12     73
13    107
14     53
15     91
16    893
17    810
18    170
19     53
20     68
21      9
22      1
23     92
24      9
25      8
26      9
27    308
28    447
29    392
30    107
31     42
32      4
33     45
34    141
35    110
36      3
37    758
38      9
39      9
40    388
41    220
42    644
43    649
44    499
45      2
46    937
47    169
48    286
49      2
dtype: int64


**Remove these columns**

In [16]:
from pandas import read_csv

df = read_csv('data/oil-spill.csv', header=None) 
print(df.shape)

# get number of unique values for each column 
# record columns to delete
counts = df.nunique()
to_del = [i for i,v in enumerate(counts) if v == 1]
print(to_del)

# drop useless columns 
df.drop(to_del, axis=1, inplace=True) 
print(df.shape)

(937, 50)
[22]
(937, 49)


## 4.2 Consider columns that have very few values
**Percentage of unique values in each columns**

In [17]:
from numpy import loadtxt
from numpy import unique

data = loadtxt('data/oil-spill.csv', delimiter=',')

for i in range(data.shape[1]):
    num = len(unique(data[:, i]))
    percentage = float(num) / data.shape[0] * 100 
    print('%d, %d, %.1f%%' % (i, num, percentage))

0, 238, 25.4%
1, 297, 31.7%
2, 927, 98.9%
3, 933, 99.6%
4, 179, 19.1%
5, 375, 40.0%
6, 820, 87.5%
7, 618, 66.0%
8, 561, 59.9%
9, 57, 6.1%
10, 577, 61.6%
11, 59, 6.3%
12, 73, 7.8%
13, 107, 11.4%
14, 53, 5.7%
15, 91, 9.7%
16, 893, 95.3%
17, 810, 86.4%
18, 170, 18.1%
19, 53, 5.7%
20, 68, 7.3%
21, 9, 1.0%
22, 1, 0.1%
23, 92, 9.8%
24, 9, 1.0%
25, 8, 0.9%
26, 9, 1.0%
27, 308, 32.9%
28, 447, 47.7%
29, 392, 41.8%
30, 107, 11.4%
31, 42, 4.5%
32, 4, 0.4%
33, 45, 4.8%
34, 141, 15.0%
35, 110, 11.7%
36, 3, 0.3%
37, 758, 80.9%
38, 9, 1.0%
39, 9, 1.0%
40, 388, 41.4%
41, 220, 23.5%
42, 644, 68.7%
43, 649, 69.3%
44, 499, 53.3%
45, 2, 0.2%
46, 937, 100.0%
47, 169, 18.0%
48, 286, 30.5%
49, 2, 0.2%


**Remove these columns**

In [None]:
from pandas import read_csv

df = read_csv('data/oil-spill.csv', header=None)
print(df.shape)

counts = df.nunique()

# record columns to delete if it contains less than 1% values
to_del = [i for i,v in enumerate(counts) if (float(v)/df.shape[0]*100) < 1] 
print(to_del)

# drop useless columns
df.drop(to_del, axis=1, inplace=True)
print(df.shape)

## 4.3 Identify and remove rows that contain duplicate data
**Locate rows of duplicate data**

In [18]:
from pandas import read_csv

df = read_csv('data/iris.csv', header=None) 
dups = df.duplicated()

# report if there are any duplicates 
print(dups.any())

# list all duplicate rows 
print(df[dups])

True
       0    1    2    3               4
34   4.9  3.1  1.5  0.1     Iris-setosa
37   4.9  3.1  1.5  0.1     Iris-setosa
142  5.8  2.7  5.1  1.9  Iris-virginica


**Delete Rows That Contain Duplicate Data**

In [None]:
from pandas import read_csv

df = read_csv('data/iris.csv', header=None) 
print(df.shape)

# delete duplicate rows 
df.drop_duplicates(inplace=True) 
print(df.shape)