In [1]:
# Install necessary libraries
!pip install pandas  pyyaml

import pandas as pd
import yaml
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Function to read the file using different approaches
def read_file(file_path, method='pandas'):
    """Load a CSV file into a DataFrame.

    Args:
        file_path: Path (or any object accepted by ``pd.read_csv``) to read.
        method: Name of the reader backend. Only ``'pandas'`` is supported.

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``.

    Raises:
        ValueError: If ``method`` is anything other than ``'pandas'``.
    """
    # Reject unsupported backends up front so the happy path stays unindented.
    if method != 'pandas':
        raise ValueError("Invalid method specified")
    return pd.read_csv(file_path)


In [3]:
# Function to clean column names (drop punctuation, trim surrounding whitespace)
def clean_column_names(df):
    """Strip punctuation and surrounding whitespace from the column names.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed, then leading/trailing whitespace
    is trimmed, e.g. ``'Unnamed: 0'`` becomes ``'Unnamed 0'``.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object; its ``columns`` are reassigned in place.
    """
    # The pattern is a raw string so ``\w``/``\s`` reach the regex engine
    # unmangled, and ``regex=True`` is explicit: pandas >= 2.0 treats the
    # pattern as a literal string by default, which would clean nothing.
    df.columns = df.columns.str.replace(r'[^\w\s]', '', regex=True).str.strip()
    return df


In [4]:
# Function to write the file in pipe separated text file (|) in gz format
def write_file(df, output_file, separator='|'):
    """Export a DataFrame as gzip-compressed delimited text.

    Args:
        df: DataFrame to export.
        output_file: Destination path for the compressed file.
        separator: Field delimiter; defaults to the pipe character.

    Returns:
        None. The file is written as a side effect, without the index column.
    """
    export_options = {
        'sep': separator,
        'compression': 'gzip',
        'index': False,  # row labels are not part of the exported data
    }
    df.to_csv(output_file, **export_options)


In [5]:
# Function to create a YAML file with column names
def create_yaml_schema(df, yaml_file, separator=','):
    """Write a YAML schema holding the frame's column names and a separator.

    The resulting document has two keys: ``columns`` (the column labels, in
    order) and ``separator``.

    NOTE(review): the default separator here is ``','`` while ``write_file``
    defaults to ``'|'``; confirm which file this schema is meant to describe.

    Args:
        df: DataFrame whose column names are recorded.
        yaml_file: Destination path of the YAML file (overwritten if present).
        separator: Separator value stored in the schema.

    Returns:
        None. The schema file is written as a side effect.
    """
    schema = {'columns': list(df.columns), 'separator': separator}
    with open(yaml_file, 'w', encoding='utf-8') as file:
        # safe_dump mirrors the safe_load used when the schema is read back:
        # it only emits plain YAML types and raises at write time for anything
        # else, instead of writing Python-specific tags the reader would reject.
        yaml.safe_dump(schema, file)


In [6]:
# Function to validate number of columns and column names with YAML
def validate_with_yaml(df, yaml_file):
    """Check that the frame's columns match the ``columns`` list in a YAML schema.

    Both the number of columns and their names/order must match exactly.

    Args:
        df: DataFrame to validate.
        yaml_file: Path of a YAML file containing a ``columns`` list.

    Returns:
        None when the columns match.

    Raises:
        ValueError: If the schema file has no ``columns`` list (for example
            an empty file), or if the frame's columns differ from it.
    """
    with open(yaml_file, 'r', encoding='utf-8') as file:
        schema = yaml.safe_load(file)

    # An empty file loads as None and a non-mapping document has no 'columns'
    # key; report both as a validation failure rather than a TypeError/KeyError.
    expected_columns = schema.get('columns') if isinstance(schema, dict) else None
    if not isinstance(expected_columns, list):
        raise ValueError(
            "Column validation failed: no 'columns' list found in {}".format(yaml_file)
        )

    # List equality already covers a differing column count, so a separate
    # length comparison is unnecessary.
    actual_columns = list(df.columns)
    if actual_columns != expected_columns:
        raise ValueError(
            "Column validation failed: expected {} columns {}, got {} columns {}".format(
                len(expected_columns), expected_columns,
                len(actual_columns), actual_columns,
            )
        )

In [7]:
# Function to create a summary of the file
def file_summary(df):
    """Return basic size statistics for a DataFrame.

    Args:
        df: DataFrame to summarise.

    Returns:
        A ``(rows, columns, size_mb)`` tuple, where ``size_mb`` is the frame's
        in-memory footprint (``memory_usage(deep=True)``) in megabytes, not
        the size of any file on disk.
    """
    row_count, column_count = len(df), len(df.columns)
    bytes_in_memory = df.memory_usage(deep=True).sum()
    return row_count, column_count, bytes_in_memory / (1024 ** 2)

In [8]:
# Set your file paths in Google Drive (the drive is mounted at /content/drive)
file_path = '/content/drive/MyDrive/steam_reviews.csv'  # input CSV passed to read_file
output_file = '/content/drive/MyDrive/output_file_steam_reviews.csv.gz'  # gzip export target passed to write_file
yaml_file = '/content/drive/MyDrive/schema_steam_reviews.yaml'  # schema written by create_yaml_schema and read by validate_with_yaml

In [9]:
# Read the input CSV into a DataFrame ('pandas' is the only method read_file accepts)
df_pandas = read_file(file_path, method='pandas')


In [10]:
# Clean column names; clean_column_names reassigns the columns on the frame it
# is given and returns that same frame
df_pandas = clean_column_names(df_pandas)



  df.columns = df.columns.str.replace('[^\w\s]', '').str.strip()


In [11]:
# Write the frame as a gzip-compressed text file; no separator is passed, so
# write_file uses its default '|'
write_file(df_pandas, output_file)


In [14]:
# Create YAML schema file
# NOTE(review): no separator is passed, so the schema records the default ','
# while the output file above was written with '|' — confirm which is intended.
import yaml
create_yaml_schema(df_pandas, yaml_file)

In [15]:
# Validate number of columns and column names with YAML; raises ValueError on
# a mismatch and returns nothing on success
validate_with_yaml(df_pandas, yaml_file)

In [16]:
# Create a summary of the frame: (row count, column count, in-memory size in MB)
summary_pandas = file_summary(df_pandas)


In [17]:
# Show the (rows, columns, size in MB) tuple returned by file_summary
print("Summary:", summary_pandas)

Summary: (21747371, 23, 11639.289216041565)


In [18]:
def pandas_detailed_summary(df):
    """Build a frame-level and per-column summary of a DataFrame.

    Args:
        df: DataFrame to summarise.

    Returns:
        A dict with ``total_rows``, ``total_columns``, ``file_size_MB`` (the
        in-memory footprint from ``memory_usage(deep=True)``, in megabytes)
        and ``columns_info``. ``columns_info`` maps each column name to a dict
        holding ``data_type``, ``null_count``, ``unique_values`` (nulls
        excluded), ``top_value`` (first mode) and ``frequency`` (count of the
        most common value). For a column with no non-null values,
        ``top_value`` is ``None`` and ``frequency`` is ``0``.
    """
    summary = {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'file_size_MB': df.memory_usage(deep=True).sum() / (1024 ** 2),
    }

    # Column-wise information
    columns_info = {}
    for col in df.columns:
        series = df[col]
        modes = series.mode()
        counts = series.value_counts()
        columns_info[col] = {
            'data_type': str(series.dtype),
            # Counts are converted to plain ints so the summary prints and
            # serialises cleanly instead of carrying numpy scalar types.
            'null_count': int(series.isnull().sum()),
            'unique_values': int(series.nunique()),
            # mode() and value_counts() ignore nulls, so both are empty for an
            # all-null (or zero-row) column; indexing them would raise IndexError.
            'top_value': modes.iloc[0] if not modes.empty else None,
            'frequency': int(counts.iloc[0]) if not counts.empty else 0,
        }

    summary['columns_info'] = columns_info
    return summary

In [19]:
# Create a detailed Pandas summary (frame-level totals plus per-column statistics)
detailed_summary_pandas = pandas_detailed_summary(df_pandas)

# Print the detailed Pandas summary; the whole nested dict is printed in one go
print("Detailed Pandas Summary:")
print(detailed_summary_pandas)

Detailed Pandas Summary:
{'total_rows': 21747371, 'total_columns': 23, 'file_size_MB': 11639.289216041565, 'columns_info': {'Unnamed 0': {'data_type': 'int64', 'null_count': 0, 'unique_values': 21747371, 'top_value': 0, 'frequency': 1}, 'app_id': {'data_type': 'int64', 'null_count': 0, 'unique_values': 315, 'top_value': 578080, 'frequency': 1644255}, 'app_name': {'data_type': 'object', 'null_count': 0, 'unique_values': 315, 'top_value': "PLAYERUNKNOWN'S BATTLEGROUNDS", 'frequency': 1644255}, 'review_id': {'data_type': 'int64', 'null_count': 0, 'unique_values': 21612444, 'top_value': 30150973, 'frequency': 2}, 'language': {'data_type': 'object', 'null_count': 0, 'unique_values': 28, 'top_value': 'english', 'frequency': 9635437}, 'review': {'data_type': 'object', 'null_count': 33742, 'unique_values': 16657837, 'top_value': 'good', 'frequency': 106671}, 'timestamp_created': {'data_type': 'int64', 'null_count': 0, 'unique_values': 18932372, 'top_value': 1542902093, 'frequency': 268}, 'time