In [4]:
import pandas as pd


CLEANING DATA

In [5]:
import os

# Path to Folder storing Multiple Folders with CSV files
main_folder_path = '/Users/francinasimone/Desktop/DreamApp1/Starlight/First_People/'

# List all sub-folders (tribes) inside the main folder
tribe_folders = [f for f in os.listdir(main_folder_path) if os.path.isdir(os.path.join(main_folder_path, f))]

# This list will hold dataframes of all tribes
all_dfs = []

for tribe in tribe_folders:
    tribe_folder_path = os.path.join(main_folder_path, tribe)
    csv_files = [f for f in os.listdir(tribe_folder_path) if f.endswith('.csv')]
    
    # List to hold DataFrames of current tribe's CSVs
    tribe_dfs = [pd.read_csv(os.path.join(tribe_folder_path, csv_file)) for csv_file in csv_files]
    
    # Concatenate the tribe's dataframes and append to all_dfs
    all_dfs.append(pd.concat(tribe_dfs, ignore_index=True))

# Finally, concatenate all tribes' dataframes
final_df = pd.concat(all_dfs, ignore_index=True)


DATA EXAMINATION

In [None]:
print(final_df.head())
print(final_df.info())


In [15]:
final_df.iloc[:,0]

Series([], Name: \nGrizzly\nA Shasta Legend\nBefore people were on the Earth, the Chief of the Great Sky Spirits\ngrew tired of his home in the Above World because it was always\ncold. So he made a hole in the sky by turning a stone around and\naround. Through the hole he pushed snow and ice until he made a\nbig mound. This mound was Mount Shasta.\nThen Sky Spirit stepped from the sky to the mountain and walked\ndown. When he got about halfway down, he thought: "On this\nmountain there should be trees." So he put his finger down\nand everywhere he touched, up sprang trees. Everywhere he stepped,\nthe snow melted and became rivers.\nThe Sky Spirit broke off the end of his big walking stick he had\ncarried from the sky and threw the pieces in the water. The long\npieces became Beaver and Otter. The smaller pieces became fish.\nFrom the other end of his stick he made the animals.\nBiggest of all was Grizzly Bear. They were covered with fur and\nhad sharp claws just like today, but they co

RESHAPING THE DATA

In [32]:
main_folder_path = '/Users/francinasimone/Desktop/DreamApp/Starlight/First_People/'

tribe_folders = [f for f in os.listdir(main_folder_path) if os.path.isdir(os.path.join(main_folder_path, f))]

# This list will store reshaped data with columns: tribe, title, text
reshaped_data = []

for tribe in tribe_folders:
    tribe_folder_path = os.path.join(main_folder_path, tribe)
    csv_files = [f for f in os.listdir(tribe_folder_path) if f.endswith('.csv')]
    
    for csv_file in csv_files:
        df = pd.read_csv(os.path.join(tribe_folder_path, csv_file), header=None)
        
        for index, row in df.iterrows():
            content = row[0]  # Assuming data is in the first column
            # Splitting by newline and ensuring we have more than one segment to prevent errors
            segments = content.split('\n', 2)
            if len(segments) >= 3:
                title = segments[1].strip()
                text = segments[2].split('\n\nReturn to', 1)[0].strip()  # Removing footer
                reshaped_data.append([tribe, title, text])


# Convert reshaped_data into a DataFrame
final_df = pd.DataFrame(reshaped_data, columns=['Culture', 'Title', 'Text'])
final_df.set_index('Culture', inplace=True)  # Setting Tribe as index


In [33]:
#Viewing the Data Frame
final_df[['Title','Text']]

Unnamed: 0_level_0,Title,Text
Culture,Unnamed: 1_level_1,Unnamed: 2_level_1
Shasta,Grizzly,A Shasta Legend\nBefore people were on the Ear...
Shasta,How Coyote Stole Fire,"A Shasta Legend\nLong ago, when man was newly ..."
Shasta,Old Man Above and the Grizzlies,"A Shasta Legend\nAlong time ago, while smoke s..."
Shasta,How Old Man above created the World,"A Shasta Legend\nLong, long ago, when the worl..."
Shasta,How The People Got Arrowheads,A Shasta Legend\nIn the days when the first pe...
...,...,...
Pen_D_Oreille,Coyote and Mountain Sheep,"A Pen D'Oreille Legend\nCoyote was traveling, ..."
Pen_D_Oreille,In Idaho,"A Pen D'Oreille Legend\nNear Spokane one day, ..."
Pen_D_Oreille,Coyote and The Snake Monster,A Pen D'Oreille Legend\nThere was a huge rattl...
Snohomish,Pushing up the Sky,A Snohomish Legend\nThe Creator and Changer fi...


Handling Missing Values, Text Cleaning, and Duplicates

In [35]:
# Drop rows where any of the columns are NaN
final_df.dropna(inplace=True)

In [37]:
# Trim spaces
final_df['Text'] = final_df['Text'].str.strip()

# Remove any potential HTML tags
final_df['Text'] = final_df['Text'].str.replace('<[^<]+?>', '', regex=True)

In [38]:
final_df.drop_duplicates(subset=['Text'], inplace=True)


Final Check

In [39]:
final_df.head()

Unnamed: 0_level_0,Title,Text
Culture,Unnamed: 1_level_1,Unnamed: 2_level_1
Shasta,Grizzly,A Shasta Legend\nBefore people were on the Ear...
Shasta,How Coyote Stole Fire,"A Shasta Legend\nLong ago, when man was newly ..."
Shasta,Old Man Above and the Grizzlies,"A Shasta Legend\nAlong time ago, while smoke s..."
Shasta,How Old Man above created the World,"A Shasta Legend\nLong, long ago, when the worl..."
Shasta,How The People Got Arrowheads,A Shasta Legend\nIn the days when the first pe...
