In [1]:
import pandas as pd
import numpy as np

In [2]:
import chardet

def read_data(filepath, filetype="csv"):
 
  if filetype not in ["csv", "xlsx"]:
    raise ValueError(f"Unsupported file type: {filetype}")

  try:
    # Open the file in binary mode and read the initial bytes for encoding detection
    with open(filepath, 'rb') as rawdata:
      content = rawdata.read(1024)  # Adjust buffer size as needed

    # Use chardet to detect encoding
    encoding_info = chardet.detect(content)
    encoding = encoding_info['encoding']

    # Read data using the detected encoding
    if filetype == "csv":
      data = pd.read_csv(filepath, encoding=encoding)
    else:
      data = pd.read_excel(filepath)
    return data

  except FileNotFoundError:
    raise FileNotFoundError(f"File not found: {filepath}")

  except UnicodeDecodeError:
    raise UnicodeDecodeError(f"Failed to decode file even with detected encoding: {encoding}")


In [3]:
def detect_column_types(df):
  

  column_types = {}

  for col in df.columns:
    column_data = df[col]
    
    # Check for numeric types
    if pd.api.types.is_numeric_dtype(column_data):
      column_types[col] = 'numeric'
    elif pd.api.types.is_datetime64_dtype(column_data):
      column_types[col] = 'datetime'
    elif isinstance(column_data.dtype, pd.CategoricalDtype):
      column_types[col] = 'categorical'
    else:
      # Handle mixed types or text data
      unique_values = column_data.unique()
      if len(unique_values) <= 10:
        column_types[col] = 'categorical'  # Consider few unique values as categorical
      else:
        column_types[col] = 'text'

  return column_types


In [4]:
# Test CSV
try:
  df = read_data("Speaker physical.csv")
  print("CSV read successfully!")
except (ValueError, FileNotFoundError) as e:
  print(f"Error reading CSV: {e}")

# Test Excel
""" try:
  df = read_data("Speaker physical.xlsx", filetype="xlsx")
  print("Excel read successfully!")
except (ValueError, FileNotFoundError) as e:
  print(f"Error reading Excel: {e}")
 """

CSV read successfully!


' try:\n  df = read_data("Speaker physical.xlsx", filetype="xlsx")\n  print("Excel read successfully!")\nexcept (ValueError, FileNotFoundError) as e:\n  print(f"Error reading Excel: {e}")\n '

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,,,,,,,,,
1,SN,Name,Sex,Country,Affliation,Mode of presentation,Comments,Sponsorship,Email,Phone number
2,1,Josephine Mebawondu,Female,NIGERIA,Afe Babalola University Ado-Ekiti,Physical,Recommended,YES,jpmebawondu@gmail.com,2.34806E+12
3,2,"ADEGUN, Iyanu Pelumi",Female,NIGERIA,"FEDERAL UNIVERSITY OF TECHNOLOGY, AKURE",Physical,Recommended,NO,iyanupelumi22@gmail.com,2.34814E+12
4,3,Castro Gbêmêmali HOUNMENOU,Male,Guinea,Centre de Recherche et de Formation en Infecti...,Physical,Recommended,NO,castro.hounmenou@cerfig.org,2.24628E+11


In [6]:
null_data = df[df.isnull().any(axis=1)]

In [7]:
null_data

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,,,,,,,,,
26,25.0,Aurelle TCHAGNA,Male,Cameroon,"College of Technology, University of Buea, Cam...",Virtual,Recommended/Good,NO,tkaurelle@gmail.com,


In [8]:
column_types = detect_column_types(df)
column_types

{'Unnamed: 0': 'text',
 'Unnamed: 1': 'text',
 'Unnamed: 2': 'categorical',
 'Unnamed: 3': 'categorical',
 'Unnamed: 4': 'text',
 'Unnamed: 5': 'categorical',
 'Unnamed: 6': 'categorical',
 'Unnamed: 7': 'categorical',
 'Unnamed: 8': 'text',
 'Unnamed: 9': 'text'}

In [12]:
data = read_data("Speaker physical.csv")

def wrangle(data, col_missing_threshold=60, row_missing_threshold=60):

  # Handle empty DataFrame
  if data.empty:
      return pd.DataFrame()

  # Calculate missing value percentage for columns
  col_missing_percentage = (data.isnull().sum() / len(data)) * 100
  cols_to_drop = col_missing_percentage[col_missing_percentage > col_missing_threshold].index
  data.drop(columns=cols_to_drop, inplace=True)

  # Calculate missing value percentage for rows
  row_missing_percentage = (data.isnull().sum(axis=1) / len(data.columns)) * 100
  rows_to_drop = row_missing_percentage[row_missing_percentage > row_missing_threshold].index
  data.drop(index=rows_to_drop, inplace=True)


  return data, column_types


In [13]:
wrangle(data)

(   Unnamed: 0                       Unnamed: 1 Unnamed: 2 Unnamed: 3  \
 1          SN                            Name         Sex    Country   
 2           1              Josephine Mebawondu     Female    NIGERIA   
 3           2             ADEGUN, Iyanu Pelumi     Female    NIGERIA   
 4           3       Castro Gbêmêmali HOUNMENOU       Male     Guinea   
 5           4  Mubarak Mohammed Al Ezzi Sufyan       Male      Yemen   
 6           5          Agboola Grace Temidayo      Female    NIGERIA   
 7           6          AYEGBUSI  FLORENCE DAMI     Female    NIGERIA   
 8           7                  Nafisah Sumaila     Female      Ghana   
 9           8               Oluyemi E. ADETOYI     Female    Nigeria   
 10          9                    Sola Babalola       Male    Nigeria   
 11         10       Theresa Funmilayo Otokola      Female    Nigeria   
 12         11       Boluwatife Mercy Ogunjirin     Female    Nigeria   
 13         12            Asegunloluwa Babalola    

In [14]:
data

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
1,SN,Name,Sex,Country,Affliation,Mode of presentation,Comments,Sponsorship,Email,Phone number
2,1,Josephine Mebawondu,Female,NIGERIA,Afe Babalola University Ado-Ekiti,Physical,Recommended,YES,jpmebawondu@gmail.com,2.34806E+12
3,2,"ADEGUN, Iyanu Pelumi",Female,NIGERIA,"FEDERAL UNIVERSITY OF TECHNOLOGY, AKURE",Physical,Recommended,NO,iyanupelumi22@gmail.com,2.34814E+12
4,3,Castro Gbêmêmali HOUNMENOU,Male,Guinea,Centre de Recherche et de Formation en Infecti...,Physical,Recommended,NO,castro.hounmenou@cerfig.org,2.24628E+11
5,4,Mubarak Mohammed Al Ezzi Sufyan,Male,Yemen,Ministry of Local Administration at Yemen,Virtual,Recommended,NO,mub.sufyan2015@gmail.com,92-3379748822
6,5,Agboola Grace Temidayo,Female,NIGERIA,Afe Babalola University Ado-Ekiti,Physical,Recommended,YES,temidayoagboola177@gmail.com,09076274339 09049347802
7,6,AYEGBUSI FLORENCE DAMI,Female,NIGERIA,FIRST TECHNICAL UNIVERSITY IBADAN,Physical,Recommended/Good,YES,florahdammmy@gmail.com,2.34703E+12
8,7,Nafisah Sumaila,Female,Ghana,University for Development Studies,Physical,Recommended/Good,YES,sumyla2nafisa@gmail.com,2.33245E+11
9,8,Oluyemi E. ADETOYI,Female,Nigeria,University of Ibadan,Physical,Recommended/Good,YES,yemi.ade.oe@gmail.com,8059228970
10,9,Sola Babalola,Male,Nigeria,Elizade University,Physical,Recommended,NO,b_sola@yahoo.com,8034505004
