In [355]:
import pandas as PandaLib
from enum import Enum

# Felix Atanasescu - HE20830




# 1. For the first step I will create an enum class in order to keep track of my 
# loaded files with ease, as I am planning to add the files in a map container, and the IDs will 
# act as the key for each file.  

class FileType(Enum):
    BOOKS = 'books'
    LISTINGS = 'listings'
    PHONE_DATA = 'phone_data'
    OLYMPICS = 'tolympics' 

dataFrame = {
    FileType.BOOKS: PandaLib.read_csv('books.csv'),
    FileType.LISTINGS: PandaLib.read_csv('listings.csv'),
    FileType.PHONE_DATA: PandaLib.read_csv('phone_data.csv'),
    FileType.OLYMPICS: PandaLib.read_csv('tolympics.csv') # broken header, so I will not load it
}

# For utility I will create a print function and declare some strings,to print text. 
# Having to repeat the text over and over is unnecessary.

def LogMessage(message):
    print("[INFO] " + message + "\n")

def LogFileReading(fileTypeID):
    fileReadBegins = f"READING FROM THE FILE----------- {fileTypeID.value} ------------ \n"
    print("[INFO]: " + fileReadBegins)

def LogFileEndReading(fileTypeID):
    fileReadEnds = f"ENDING READING FOR THE FILE ----------- {fileTypeID.value} ------------ \n"
    print("[INFO]: " + fileReadEnds)
# Done 

In [164]:
# 2.
for fileTypeID, someData in dataFrame.items():
    LogFileReading(fileTypeID)
    
    print("First 10 rows:")
    print(someData.head(10)) 

    print("\nLast 10 rows:")  
    print(someData.tail(10))  
    LogFileEndReading(fileTypeID)
# Done

[INFO]: READING FROM THE FILE----------- books ------------ 

First 10 rows:
   Identifier              Edition Statement  \
0         206                            NaN   
1         216                            NaN   
2         218                            NaN   
3         472                            NaN   
4         480   A new edition, revised, etc.   
5         481  Fourth edition, revised, etc.   
6         519                            NaN   
7         667                            NaN   
8         874                            NaN   
9        1143                            NaN   

                  Place of Publication Date of Publication  \
0                               London         1879 [1878]   
1             London; Virtue & Yorston                1868   
2                               London                1869   
3                               London                1851   
4                               London                1857   
5                     

In [166]:
# 3. For the thrid step, I wll crate a simple array of strings for each file that will
# represent 3 columns each.
books_columns_to_show = ['Title', 'Author', 'Place of Publication'] 
listings_columns_to_show = ['room_type', 'price', 'availability_365']  
phone_data_columns_to_show = ['index', 'network', 'network_type'] 
olympics_columns_to_show = ['1', '2', '3']

# Once that done, I will iterate through the map and print the details of the columns.
# Since it is a map of files, each file has an ID,
# so in order to print the desired columns data for each file
# I will have to use conditions 
# Explanation: if the n0 file is currently iterated, "print the data of this file",
# else if n1 is currently iterated, "print the data of n1 file", etc.

for fileTypeID, data in dataFrame.items():
    LogFileReading(fileTypeID)
    
    if fileTypeID == FileType.BOOKS:
        print(data[books_columns_to_show]) 
             
    elif fileTypeID == FileType.LISTINGS:
        print(data[listings_columns_to_show]) 
              
    elif fileTypeID == FileType.PHONE_DATA:
        print(data[phone_data_columns_to_show])
              
    elif fileTypeID == FileType.OLYMPICS:
        print(data[olympics_columns_to_show])

    LogFileEndReading(fileTypeID)
# Done

[INFO]: READING FROM THE FILE----------- books ------------ 

                                                  Title  \
0                     Walter Forbes. [A novel.] By A. A   
1     All for Greed. [A novel. The dedication signed...   
2     Love the Avenger. By the author of “All for Gr...   
3     Welsh Sketches, chiefly ecclesiastical, to the...   
4     [The World in which I live, and my place in it...   
...                                                 ...   
8282  The Parochial History of Cornwall, founded on,...   
8283  The History and Gazetteer of the County of Der...   
8284  Magna Britannia; being a concise topographical...   
8285  An historical, topographical and descriptive v...   
8286  Collectanea Topographica et Genealogica. [Firs...   

                                                 Author  \
0                                                 A. A.   
1                                             A., A. A.   
2                                             A., A.

In [170]:
# Step 4. Next I will add a new column to each dataFrame that I previously 
# loaded. The new name "The_New_Frame_Column" and it can have any desired value.
# The steps from above will need to be repeated since there are separate exercises, otherwise, 
# I could have fit this step and the previous one into a single one, simplyfing the work by reducing 
# it to one iterator and conditional check, but for the sake of separation of concerns, and exercise 
# requirements, I will do it in a new cell. ^_^

columnName = "The_New_Column"
columnValue = 0

for fileTypeID, data in dataFrame.items():
    LogFileReading(fileTypeID)

    columnValue += 1
    
    if fileTypeID == FileType.BOOKS:
        data[columnName] = columnValue
        
    elif fileTypeID == FileType.LISTINGS:
        data[columnName] = columnValue

    elif fileTypeID == FileType.PHONE_DATA:
        data[columnName] = columnValue
    
    elif fileTypeID == FileType.OLYMPICS:
        data[columnName] = columnValue
        
    print("The new column added:")    
    print(data.head())
    LogFileEndReading(fileTypeID) 
#Done 

[INFO]: READING FROM THE FILE----------- books ------------ 

The new column added:
   Identifier             Edition Statement      Place of Publication  \
0         206                           NaN                    London   
1         216                           NaN  London; Virtue & Yorston   
2         218                           NaN                    London   
3         472                           NaN                    London   
4         480  A new edition, revised, etc.                    London   

  Date of Publication              Publisher  \
0         1879 [1878]       S. Tinsley & Co.   
1                1868           Virtue & Co.   
2                1869  Bradbury, Evans & Co.   
3                1851          James Darling   
4                1857   Wertheim & Macintosh   

                                               Title     Author  \
0                  Walter Forbes. [A novel.] By A. A      A. A.   
1  All for Greed. [A novel. The dedication signed...  

In [224]:
# Step 5: For this step, I will choose a specific column for each data frame to
# calculate their average values. 
# To do so, I will create a new map for each file pointing to the chosen column.

average_column = {
    FileType.BOOKS: 'Identifier',
    FileType.LISTINGS: 'price',
    FileType.PHONE_DATA: 'duration',
    FileType.OLYMPICS: '5'
}

for fileTypeID, someData in dataFrame.items():
    LogFileReading(fileTypeID)
    
    # Need to write a safety guard to check if the column exists,
    # otherwise runtime errors / undefined behaviours might occur.
    # Since it can only be eiter true or false, I will do 
    # a negation instead of a straight forward if else, a good practice standard
    # which will reduce the amount of written code, and therefore -> more readable 
    
    if average_column[fileTypeID] not in someData.columns: 
        LogMessage(f"Column '{average_column[fileTypeID]}' not found in {fileTypeID.value}.")
        continue  # Skip to the next iteration if the column is not found.

    someData[average_column[fileTypeID]] = PandaLib.to_numeric(someData[average_column[fileTypeID]], errors='coerce')
    average_value = someData[average_column[fileTypeID]].mean()
    LogMessage(f"The average of {average_column[fileTypeID]} in {fileTypeID.value}: {average_value:.2f}")

# Errors:
#  1. The safety guards were indeed useful, the output has shown that some of the columns don't existt.
#     why: case sensitive 
#     what I did: I had to double-check the column cases and correct in the map where needed. 

#  2. Due to the fact that the rows are numbered manually and under them there are strings,
#     I converted the string value to a numeric value. this might lead to some loss of accuracy, 
#     but for now should be fine.

# done

[INFO]: READING FROM THE FILE----------- books ------------ 

[INFO] The average of Identifier in books: 2017344.16

[INFO]: READING FROM THE FILE----------- listings ------------ 

[INFO] The average of price in listings: 110.02

[INFO]: READING FROM THE FILE----------- phone_data ------------ 

[INFO] The average of duration in phone_data: 117.80

[INFO]: READING FROM THE FILE----------- tolympics ------------ 

[INFO] The average of 5 in tolympics: 200.19



In [284]:
# Step 6: For this step, I will display only the rows where the value 
# in any column is greater than 90.
# So I will start by iterating the map once more

filterinValue = 90 # no idea how to actually declare a const in python xD

for fileTypeID, someData in dataFrame.items():
    LogFileReading(fileTypeID)
    
    # I convert all values that are not valid(strings, chars, etc) to numerics, 
    # avoiding the errors from step 5. ("Fool me once... No more string calculation errors (-_-)")
    
    someData = someData.apply(PandaLib.to_numeric, errors='coerce')
    
    # Filter rows where any column has a value greater than filteringValue
    dataFilter = someData[someData > filteringValue].dropna(how='all')

    # Beautify the output by replacing the "NaN" with an empty space
    dataFilter = dataFilter.fillna(" ")
    
    LogMessage("Rows where any value is above 90: ")
    LogMessage(dataFilter.to_string())
    LogFileEndReading(fileTypeID)
    

[INFO]: READING FROM THE FILE----------- books ------------ 

[INFO] Rows where any value is above 90: 

[INFO]       Identifier Edition Statement Place of Publication Date of Publication Publisher Title Author Contributors Corporate Author Corporate Contributors Former owner Engraver Issuance type Flickr URL Shelfmarks
0            206                                                                                                                                                                                                 
1            216                                                     1868.0                                                                                                                                      
2            218                                                     1869.0                                                                                                                                      
3            472                                

In [317]:
# Step 7. For this step, I will check if there are any missing values 
# in the rows of each DataFrame. 
# - If any missing values are found, I will fill them with the character "M"

for fileTypeID, someData in dataFrame.items():
    LogFileReading(fileTypeID)

    # Debugging - Logging the current data frame content for verification
    
    # Check for !null 
    if not someData.isnull().values.any():
        LogMessage("No missing values found within Data")
        continue
        
    # If missing values are found
    LogMessage("Missing values have been found! Filling with 'M': ")

    # but first we need type conversion to string.
    someData = someData.astype(str)
    someData.fillna("M", inplace=True)
        
    # Log the modified DataFrame
    LogMessage(f"Updated data for {fileTypeID} after filling NaNs:")
    LogMessage(someData.to_string())
 
# BUG: "??? No missing values found ???". 
# Potential logic error in my approach or misunderstood 
# the problem context - To fix later.

# Done (not quite)

[INFO]: READING FROM THE FILE----------- books ------------ 

[INFO] No missing values found within Data

[INFO]: READING FROM THE FILE----------- listings ------------ 

[INFO] No missing values found within Data

[INFO]: READING FROM THE FILE----------- phone_data ------------ 

[INFO] No missing values found within Data

[INFO]: READING FROM THE FILE----------- tolympics ------------ 

[INFO] No missing values found within Data



In [347]:
# Step 8. For this step, I will sort a specified column in 
# each DataFrame in ascending order.

book_column = 'Identifier'
phone_column = 'duration'
listing_column = 'price'
olympics_column = '5'

for fileTypeID, someData in dataFrame.items():
    LogFileReading(fileTypeID)

    if fileTypeID == FileType.BOOKS:
        sortedData = someData.sort_values(by=book_column, ascending=True)
        LogMessage(f"Sorted data for {fileTypeID} by column '{book_column}' in ascending order:")
             
    elif fileTypeID == FileType.LISTINGS:
        sortedData = someData.sort_values(by=listing_column, ascending=True)
        LogMessage(f"Sorted data for {fileTypeID} by column '{listing_column}' in ascending order:")
              
    elif fileTypeID == FileType.PHONE_DATA:
        sortedData = someData.sort_values(by=phone_column, ascending=True)
        LogMessage(f"Sorted data for {fileTypeID} by column '{phone_column}' in ascending order:")
              
    elif fileTypeID == FileType.OLYMPICS:
        sortedData = someData.sort_values(by=olympics_column, ascending=True)
        LogMessage(f"Sorted data for {fileTypeID} by column '{olympics_column}' in ascending order:")

# Done

[INFO]: READING FROM THE FILE----------- books ------------ 

[INFO] Sorted data for FileType.BOOKS by column 'Identifier' in ascending order:

[INFO]: READING FROM THE FILE----------- listings ------------ 

[INFO] Sorted data for FileType.LISTINGS by column 'price' in ascending order:

[INFO]: READING FROM THE FILE----------- phone_data ------------ 

[INFO] Sorted data for FileType.PHONE_DATA by column 'duration' in ascending order:

[INFO]: READING FROM THE FILE----------- tolympics ------------ 

[INFO] Sorted data for FileType.OLYMPICS by column '5' in ascending order:



In [357]:
# Step 9. Exporting to csv

for fileTypeID, someData in dataFrame.items():
    modified_filename = f"modified_{fileTypeID}.csv"
    
    someData.to_csv(modified_filename, index=False)
    LogMessage(f"Exported modified data to {modified_filename}")

#Done

[INFO] Exported modified data to modified_FileType.BOOKS.csv

[INFO] Exported modified data to modified_FileType.LISTINGS.csv

[INFO] Exported modified data to modified_FileType.PHONE_DATA.csv

[INFO] Exported modified data to modified_FileType.OLYMPICS.csv

