Purpose

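Load the raw app review CSVs, clean them for exploratory data analysis (keep only the `score`, `content`, and `at` columns, drop reviews with duplicate text, and parse timestamps), and save the cleaned versions to `data/cleaned`.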

In [1]:
import os

# Show where the kernel is running from, as a sanity check before touching files.
print(f"Current working directory: {os.getcwd()}")

Current working directory: d:\PM\productivity-friction-insight-report\notebooks


In [3]:
import pandas as pd
import os

# Resolve the data directory relative to the notebook's working directory
# (the notebook runs from <repo>/notebooks and the data lives in <repo>/data)
# instead of hard-coding one machine's drive path, so the notebook works from
# any checkout. The result is still an absolute path. Set the REVIEWS_DATA_DIR
# environment variable to point somewhere else.
DATA_DIR = os.environ.get(
    "REVIEWS_DATA_DIR",
    os.path.abspath(os.path.join(os.getcwd(), os.pardir, "data")),
)
CLEAN_DIR = os.path.join(DATA_DIR, "cleaned")

# Fail fast when the data directory is not where we expect it; otherwise
# makedirs below would silently create an empty tree in the wrong location.
if not os.path.isdir(DATA_DIR):
    raise FileNotFoundError(
        f"Data directory not found: {DATA_DIR}. "
        "Run the notebook from the 'notebooks' folder or set REVIEWS_DATA_DIR."
    )

# Make cleaned folder if it does not exist
os.makedirs(CLEAN_DIR, exist_ok=True)

print("Data dir:", DATA_DIR)
print("Cleaned data dir:", CLEAN_DIR)


Data dir: D:\PM\productivity-friction-insight-report\data
Cleaned data dir: D:\PM\productivity-friction-insight-report\data\cleaned


In [4]:
# Only app review CSVs. os.listdir returns entries in arbitrary order, so sort
# to make the processing order (and the logs below) deterministic across runs.
files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith("_reviews_sample.csv"))
print("Found CSV files:", files)


Found CSV files: ['notion_reviews_sample.csv', 'slack_reviews_sample.csv', 'trello_reviews_sample.csv']


In [5]:
# Columns we care about
CLEAN_COLUMNS = ["score", "content", "at"]

for file in files:
    # Split on the LAST "_reviews" so an app name that itself contains
    # "_reviews" is kept whole and two inputs cannot map to the same output.
    app_name = file.rsplit("_reviews", 1)[0]
    print(f"Loading {app_name} from {file}...")

    # Load CSV
    raw_df = pd.read_csv(os.path.join(DATA_DIR, file))

    # Report missing columns with the file name instead of an opaque KeyError.
    missing = [col for col in CLEAN_COLUMNS if col not in raw_df.columns]
    if missing:
        raise KeyError(f"{file} is missing expected column(s): {missing}")

    # Keep relevant columns only, then drop reviews with duplicate text
    # (the first occurrence of each text is kept).
    deduped = raw_df[CLEAN_COLUMNS].drop_duplicates(subset=["content"])

    # Normalize timestamps to datetime; unparseable values become NaT.
    parsed_at = pd.to_datetime(deduped["at"], errors="coerce")

    # Count values that were present but could not be parsed, so the coercion
    # is not silent.
    n_unparsed = int((parsed_at.isna() & deduped["at"].notna()).sum())
    if n_unparsed:
        print(f"{app_name}: warning - {n_unparsed} timestamp(s) could not be parsed and were set to NaT")

    # Build a new frame with .assign rather than writing into a sliced frame,
    # which avoids pandas' SettingWithCopyWarning.
    df = deduped.assign(at=parsed_at)

    # Save cleaned CSV
    out_file = os.path.join(CLEAN_DIR, f"{app_name}_reviews_cleaned.csv")
    df.to_csv(out_file, index=False)

    print(f"{app_name}: cleaned {len(df)} rows -> saved to {out_file}")


Loading notion from notion_reviews_sample.csv...
notion: cleaned 20 rows -> saved to D:\PM\productivity-friction-insight-report\data\cleaned\notion_reviews_cleaned.csv
Loading slack from slack_reviews_sample.csv...
slack: cleaned 20 rows -> saved to D:\PM\productivity-friction-insight-report\data\cleaned\slack_reviews_cleaned.csv
Loading trello from trello_reviews_sample.csv...
trello: cleaned 20 rows -> saved to D:\PM\productivity-friction-insight-report\data\cleaned\trello_reviews_cleaned.csv


In [6]:
# Check cleaned files. Sorted so that "the first file" is the same on every
# run (os.listdir order is arbitrary).
cleaned_files = sorted(f for f in os.listdir(CLEAN_DIR) if f.endswith("_cleaned.csv"))
print("Cleaned CSV files:", cleaned_files)

# Give a clear error instead of an IndexError when nothing was written.
if not cleaned_files:
    raise FileNotFoundError(f"No *_cleaned.csv files found in {CLEAN_DIR}")

# Load one file to inspect. CSV does not store dtypes, so re-parse "at" to get
# datetimes back instead of plain strings.
df_sample = pd.read_csv(os.path.join(CLEAN_DIR, cleaned_files[0]), parse_dates=["at"])
print(df_sample.head())


Cleaned CSV files: ['notion_reviews_cleaned.csv', 'slack_reviews_cleaned.csv', 'trello_reviews_cleaned.csv']
   score                                            content  \
0      5                                                 ok   
1      5                                              great   
2      1  I would have given less stars but one star is ...   
3      5                              A very wonderful tool   
4      1  if exporting all the documents at once was fre...   

                    at  
0  2025-12-02 16:08:30  
1  2025-12-02 16:06:30  
2  2025-12-02 15:17:49  
3  2025-12-02 14:55:50  
4  2025-12-02 10:24:20  
