# Importing necessary libraries

In [2]:
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re
import testutility as util

# Defining the helper functions (YAML config reader and column-header validation)...

In [2]:
def read_config_file(filepath):
    """Load a YAML configuration file.

    Args:
        filepath: Path of the YAML file to open for reading.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content, or
        ``None`` when the content is not valid YAML (the parser error is
        logged instead of being raised).
    """
    with open(filepath, 'r') as stream:
        try:
            parsed = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Malformed YAML is reported through logging; the caller gets None.
            logging.error(exc)
            parsed = None
    return parsed

In [3]:
def replacer(string, char):
    """Collapse every run of two or more consecutive ``char`` into one.

    ``char`` is matched as literal text, so regex metacharacters such as
    ``.`` or ``+`` are safe to pass, and a multi-character ``char`` is
    treated as a single repeating unit.

    Args:
        string: Text to clean up.
        char: Literal character (or substring) whose repeats are collapsed.

    Returns:
        ``string`` with each run of repeated ``char`` replaced by a single
        ``char``. An empty ``char`` leaves ``string`` unchanged.
    """
    if not char:
        # Nothing to collapse; an empty pattern could only match empty strings.
        return string
    # Escape so metacharacters are matched literally, and group so that a
    # multi-character ``char`` is repeated as a whole rather than only its
    # last character.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement keeps backslashes in ``char`` from being
    # interpreted as replacement-template escapes.
    return re.sub(pattern, lambda _match: char, string)

In [4]:
def col_header_val(df, table_config):
    '''
    Standardize the column names of ``df`` and validate them against the
    expected columns listed in ``table_config['columns']``.

    Standardization is applied to ``df.columns`` in place (the caller's
    frame keeps the cleaned names): names are lowercased, every non-word
    character becomes '_', leading/trailing '_' are stripped and runs of
    '_' are collapsed to a single '_'.

    The comparison is order-insensitive and lowercases the configured
    names before comparing.

    Args:
        df: DataFrame whose column labels are strings.
        table_config: mapping with a 'columns' entry listing the expected
            column names.

    Returns:
        1 when the standardized column names match the expected ones,
        0 otherwise (the differences are printed and both name lists are
        logged at INFO level).
    '''
    # Raw-string patterns: '\W' in a plain string is an invalid escape.
    df.columns = (
        df.columns.str.lower()
        .str.replace(r'\W', '_', regex=True)
        .str.strip('_')
        .str.replace(r'_{2,}', '_', regex=True)
    )

    expected_col = sorted(name.lower() for name in table_config['columns'])
    # Sort only the labels; reindexing the frame would copy all the data and
    # raise if standardization produced duplicate names.
    actual_col = df.columns.sort_values()

    if list(actual_col) == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    mismatched_columns_file = sorted(set(actual_col).difference(expected_col))
    print("Following File columns are not in the YAML file", mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(actual_col))
    print("Following YAML columns are not in the file uploaded", missing_YAML_file)
    logging.info(f'df columns: {actual_col}')
    logging.info(f'expected columns: {expected_col}')
    return 0

# Creating the schema YAML file

In [26]:
%%writefile file.yaml
# Schema/config consumed by the validation cells of this notebook.
# file_name and file_type are joined with a '.' to form the source file name.
file_type: csv 
dataset_name: netflix movie history
file_name: train
table_name: watch records
# NOTE(review): the delimiter and skip_leading_rows keys below are not read by
# any cell visible in this notebook - confirm where they are consumed.
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
# Expected column names, compared with the data file's standardized headers.
columns: 
    - movie
    - user

Overwriting file.yaml


# Loading the configuration from the YAML file just created...

In [27]:
# Parse file.yaml through the testutility helper; the result is the config dict
# used by the cells below.
config_data = util.read_config_file("file.yaml")

In [28]:
# Show the parsed configuration as the cell's output.
config_data

{'file_type': 'csv',
 'dataset_name': 'netflix movie history',
 'file_name': 'train',
 'table_name': 'watch records',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['movie', 'user']}

# Displaying the original dataset...

In [8]:
# Load the raw CSV into memory, then preview its first rows.
df_sample = pd.read_csv("train.csv", sep=",")
df_sample.head()

Unnamed: 0,movie,user,rating,date
0,10341,510180,4,1999-11-11
1,1798,510180,5,1999-11-11
2,10774,510180,3,1999-11-11
3,8651,510180,2,1999-11-11
4,14660,510180,2,1999-11-11


# Here's where we start making the first steps of validation between the two files (YAML & original dataset).

In [29]:
# File extension declared in the YAML config.
file_type = config_data['file_type']

In [30]:
# Source file name built from the config as "<file_name>.<file_type>".
# No leading space: the value must be usable as a path.
source_file = f"{config_data['file_name']}.{file_type}"

In [31]:
# Show the source file name derived from the config.
print("Netflix's movies watch history... ", source_file)

Netflix's movies watch history...   train.csv


# Let's see which columns are available/missing in the original dataset compared with the YAML schema:

In [19]:
# Check the frame's headers against the YAML schema. In the col_header_val shown
# earlier in this notebook the result is 1 on a match and 0 otherwise, and the
# frame's column names are standardized in place as a side effect
# (presumably util.col_header_val is that same function - TODO confirm).
util.col_header_val(df_sample, config_data)

column name and column length validation failed
Following File columns are not in the YAML file ['date', 'rating']
Following YAML columns are not in the file uploaded []


0

In [35]:
# Put the file's columns and the schema's columns side by side for comparison.
print(f"columns of files are: {df_sample.columns}")
print(f"columns of YAML are: {config_data['columns']}")

columns of files are: Index(['movie', 'user', 'rating', 'date'], dtype='object')
columns of YAML are: ['movie', 'user']


# From this, we clearly see that two columns of the original data (rating and date) are missing from the YAML schema.

In [36]:
# A return value of 0 from the header check means the schema and file disagree.
print(
    "validation failed"
    if util.col_header_val(df_sample, config_data) == 0
    else "col validation passed"
)

column name and column length validation failed
Following File columns are not in the YAML file ['date', 'rating']
Following YAML columns are not in the file uploaded []
validation failed


# Hence, the validation fails.

# Summary

## dataset: Netflix's movies watch history.
## file name: train.csv
## file size = 2.088 GB
## rows = ~2,000,000
## columns = 4

# Here, we see that the YAML file we've just created (file.yaml) does not validate against our original dataset (train.csv), because its schema omits two of the dataset's columns (rating and date).
# Therefore, verification fails.


# Thanks for taking the time to check out my project :)