In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Pandas (continued)

### References
- https://www.freecodecamp.org/news/how-to-combine-multiple-csv-files-with-8-lines-of-code-265183e0854/

In [None]:
import glob
import numpy as np
import os
import pandas as pd
import seaborn as sns

In [None]:
# Load seaborn's "diamonds" example dataset and show its (rows, columns) size.
df = sns.load_dataset("diamonds")
df.shape

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

# 1. Splitting a DataFrame
## 1.1 Splitting by row index
<img src="https://scikit-learn.org/stable/_images/grid_search_cross_validation.png" width="60%" height="60%" />

In [None]:
# First part of the split: the first 1000 rows by position (0-999),
# with every column kept.
df_1 = df.iloc[:1000]
df_1.shape

In [None]:
# Second part of the split: everything from position 1000 onwards.
# df_1 holds positions 0-999, so this slice has to start at 1000; starting
# at 1001 would leave row 1000 out of both parts.
df_2 = df.iloc[1000:, :]
df_2.shape

## 1.2 Splitting off a fixed fraction of a DataFrame (60%, random selection)

In [None]:
# Randomly pick 60% of the rows; the fixed random_state makes the draw
# reproducible.
df_split = df.sample(frac=0.6, random_state=200)

# reset_index() returns a new frame rather than changing df_split, so the
# result has to be assigned back for the renumbering to take effect.
# drop=True discards the old row labels instead of adding them as a column,
# which keeps the column count (and therefore the shape below) unchanged.
df_split = df_split.reset_index(drop=True)
df_split.shape

## 1.3 Splitting a DataFrame by groups

In [None]:
# Group the rows by the values of the "color" column, then pull out the
# rows that belong to group "E".
grouped = df.groupby(by=df["color"])
df_new = grouped.get_group("E")
df_new.shape

In [None]:
# Preview the first five rows of the selected group.
df_new.head(n=5)

# 2. Join DataFrames

In [None]:
# Left-hand example frame: six rows with columns 'key' (K0-K5) and 'A' (A0-A5).
df = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
    A=['A0', 'A1', 'A2', 'A3', 'A4', 'A5'],
))
df

In [None]:
# Right-hand example frame: three rows with columns 'key' (K0-K2) and 'B' (B0-B2).
other = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2'],
    B=['B0', 'B1', 'B2'],
))
other

## 2.1 Left join

In [None]:
# Left join: keep every row of df and attach the row of other that has the
# same index label. No `on` column is given, so rows are matched by index.
# Both frames have a 'key' column, so the suffixes rename them to
# key_caller / key_other.
df2 = df.join(
    other,
    lsuffix='_caller',
    rsuffix='_other',
    how='left',
)
df2

## 2.2 Right join

In [None]:
# Right join: keep every row of other and attach the row of df that has the
# same index label. No `on` column is given, so rows are matched by index.
# Both frames have a 'key' column, so the suffixes rename them to
# key_caller / key_other.
df2 = df.join(
    other,
    lsuffix='_caller',
    rsuffix='_other',
    how='right',
)
df2

## 2.3 Inner join

In [None]:
# Inner join: keep only the index labels that occur in both df and other.
# No `on` column is given, so rows are matched by index. Both frames have a
# 'key' column, so the suffixes rename them to key_caller / key_other.
df2 = df.join(
    other,
    lsuffix='_caller',
    rsuffix='_other',
    how='inner',
)
df2

## 2.4 Outer join

In [None]:
# Outer join: keep the index labels that occur in either df or other.
# No `on` column is given, so rows are matched by index. Both frames have a
# 'key' column, so the suffixes rename them to key_caller / key_other.
df2 = df.join(
    other,
    lsuffix='_caller',
    rsuffix='_other',
    how='outer',
)
df2

# 3. Working with multiple datasets
**Concatenate several files that share the same format, then save the combined file for later use**

In [None]:
import glob
import numpy as np
import os
import pandas as pd

In [None]:
# Folder that holds the input files; echo it, then list its contents.
dir_in = 'data/input'
print(dir_in)
os.listdir(path=dir_in)

### Select files

In [None]:
# Collect every regular file in dir_in (change the pattern to '*.csv' to
# keep only CSV files).
# - glob gives no ordering guarantee, so the paths are sorted to make the
#   order of the later concatenation reproducible.
# - Directories are skipped because they cannot be read as CSV.
filenames = sorted(
    path
    for path in glob.glob(os.path.join(dir_in, '*'))
    if os.path.isfile(path)
)
print(*filenames, sep='\n')

### Concatenate the selected files into one combined DataFrame

In [None]:
# Read every selected file and stack the rows into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# file's own 0-based row labels are repeated, which makes label-based
# lookups on the combined frame ambiguous.
combined_file = pd.concat(
    [pd.read_csv(path) for path in filenames],
    ignore_index=True,
)

### Export to csv

In [None]:
# Write the combined frame to <dir_out>/<filename_out> as UTF-8 text,
# leaving the row index out of the file.
dir_out = '.'
filename_out = 'combined_file.csv'
combined_file.to_csv(
    '/'.join((dir_out, filename_out)),
    encoding='utf-8',
    index=False,
)

# 4. Removing uninformative columns and duplicate rows

## 4.1 Identify and remove columns that contain a single value
**The number of unique values for each column**

In [None]:
# Load the oil-spill data (header=None: the first line is data, not column
# names) and print how many distinct values each column holds.
df = pd.read_csv('data/oil-spill.csv', header=None)
print(df.nunique())

**Remove these columns**

In [None]:
# Start again from the raw file (header=None: the first line is data).
df = pd.read_csv('data/oil-spill.csv', header=None)
print(df.shape)

# Count the distinct values per column and collect the labels of the columns
# that hold a single value. The labels are taken from the index of `counts`
# instead of enumerate() positions, so the selection stays correct even when
# the column labels are not 0..n-1.
counts = df.nunique()
to_del = counts[counts == 1].index.tolist()
print(to_del)

# Drop the constant columns. Assigning the result back avoids mutating the
# frame in place.
df = df.drop(columns=to_del)
print(df.shape)

## 4.2 Consider columns that have very few values
**Percentage of unique values in each column**

In [None]:
from numpy import loadtxt, unique

# Load the comma-separated file as a 2-D numeric array.
data = loadtxt('data/oil-spill.csv', delimiter=',')
n_rows, n_cols = data.shape

# For every column print: column number, number of distinct values, and
# that number as a percentage of the row count.
for col in range(n_cols):
    distinct = unique(data[:, col]).size
    share = distinct / n_rows * 100
    print('%d, %d, %.1f%%' % (col, distinct, share))

**Remove these columns**

In [None]:
# Start again from the raw file (header=None: the first line is data).
df = pd.read_csv('data/oil-spill.csv', header=None)
print(df.shape)

counts = df.nunique()

# Collect the labels of the columns whose number of distinct values is less
# than 1% of the number of rows. The labels are taken from the index of
# `counts` instead of enumerate() positions, so the selection stays correct
# even when the column labels are not 0..n-1.
to_del = counts[counts / df.shape[0] * 100 < 1].index.tolist()
print(to_del)

# Drop the low-cardinality columns. Assigning the result back avoids
# mutating the frame in place.
df = df.drop(columns=to_del)
print(df.shape)

## 4.3 Identify and remove rows that contain duplicate data
**Locate rows of duplicate data**

In [None]:
# Load the iris data (header=None: the first line is data, not column names).
df = pd.read_csv('data/iris.csv', header=None)

# Boolean mask with one entry per row, True where the row is flagged as a
# duplicate.
dups = df.duplicated()

# Report whether any duplicate exists, then list the flagged rows.
print(dups.any())
print(df.loc[dups])

**Delete Rows That Contain Duplicate Data**

In [None]:
# Load the iris data (header=None: the first line is data, not column names)
# and show its size before the clean-up.
df = pd.read_csv('data/iris.csv', header=None)
print(df.shape)

# Remove the duplicate rows from df itself, then show the size again so the
# number of removed rows can be read off.
df.drop_duplicates(inplace=True)
print(df.shape)