In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Pandas (continue)

### References
- https://www.freecodecamp.org/news/how-to-combine-multiple-csv-files-with-8-lines-of-code-265183e0854/

In [2]:
import glob
import numpy as np
import os
import pandas as pd
import seaborn as sns

In [3]:
df = sns.load_dataset('diamonds')
df.shape

(53940, 10)

In [11]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


# Spliting a DataFrame
### Splitting by row index
<img src="https://scikit-learn.org/stable/_images/grid_search_cross_validation.png" width="60%" height="60%" />

In [5]:
df_1 = df.iloc[:1000,:]
df_1.shape

(1000, 10)

In [7]:
df_2 = df.iloc[1001:,:]
df_2.shape

(52939, 10)

### Splitting dataframe in a particular size (60%, random selection)

In [8]:
df_split = df.sample(frac=0.6, random_state=200)
df_split.reset_index()
df_split.shape

(32364, 10)

### Splitting dataframe by groups

In [9]:
grouped = df.groupby(df.color)
df_new = grouped.get_group("E")
df_new.shape

(9797, 10)

In [10]:
df_new.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
14,0.2,Premium,E,SI2,60.2,62.0,345,3.79,3.75,2.27


# Join DataFrames

In [None]:
df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                   'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
df

In [None]:
other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                      'B': ['B0', 'B1', 'B2']})
other

### Left join

In [None]:
df2 = df.join(other, how='left', lsuffix='_caller', rsuffix='_other')
df2

### Right join

In [None]:
df2 = df.join(other, how='right', lsuffix='_caller', rsuffix='_other')
df2

### Inner join

In [None]:
df2 = df.join(other, how='inner', lsuffix='_caller', rsuffix='_other')
df2

### Outer join

In [None]:
df2 = df.join(other, how='outer', lsuffix='_caller', rsuffix='_other')
df2

# Working with more datasets
**Concat more files in the same format, then save the combined file for later use**

In [None]:
import glob
import numpy as np
import os
import pandas as pd

In [None]:
dir_in = 'data/input'
print(dir_in)
os.listdir(dir_in)

### Select files

In [None]:
# all files with .csv 
# extension = 'csv'
# filenames = [i for i in glob.glob('*.{}'.format(extension))]

# all files
filenames = [i for i in glob.glob(dir_in + '/*')]
print(*filenames, sep='\n')

### Concat selected files to one combined

In [None]:
combined_file = pd.concat([pd.read_csv(f) for f in filenames ])

### Export to csv

In [None]:
dir_out = '.'
filename_out = 'combined_file.csv'
combined_file.to_csv(dir_out + '/' + filename_out, index=False, encoding='utf-8')