# Check for Python and pip installation

In [10]:
# ========================================
# SETUP: Conda Environment & Package Installation
# ========================================

# STEP 1: Make sure you're in the correct conda environment.
# Run this command in your terminal (NOT in this notebook):
#   conda activate lemo
#
# You should see "(lemo)" at the start of your terminal prompt.

# STEP 2: Install or upgrade the package (if needed).
# Uncomment the line below and run this cell. `%pip` is used instead of
# `! pip` so the install targets the environment of the running kernel.

# %pip install -U lemo-vocabulate

# STEP 3: Verify your setup.
import sys

print("=" * 50)
print("ENVIRONMENT CHECK")
print("=" * 50)
print(f"Python location: {sys.executable}")
print(f"Python version: {sys.version.split()[0]}")  # version number only, no build details

try:
    import lemo_vocabulate
except ImportError:
    # Package is not importable from this kernel: tell the user how to recover.
    print("❌ lemo-vocabulate NOT found")
    print("\nTo fix this:")
    print("1. Make sure conda environment is activated if you're using one")
    print("2. Uncomment the pip install line above and run this cell again")
else:
    # Fall back to a placeholder instead of raising AttributeError when the
    # installed build does not expose __version__.
    print(f"✅ lemo-vocabulate installed: v{getattr(lemo_vocabulate, '__version__', 'unknown')}")
    print("\nYou're all set! Continue to the next cell.")
print("=" * 50)

ENVIRONMENT CHECK
Python location: /Users/sm9518/miniconda3/envs/test_lemo/bin/python
Python version: 3.8.20
✅ lemo-vocabulate installed: v1.0.1

You're all set! Continue to the next cell.


# Load in Dependencies 

In [11]:
import pandas as pd
import numpy as np
import random  # do not alias
from lemo_vocabulate import run_vocabulate_analysis, get_data_path

# Load in Toy Dataset and Inspect

In [3]:
# Read the toy dataset from the working directory, then preview up to the
# first 20 rows as the cell's displayed output.
df = pd.read_csv(filepath_or_buffer="example_df.csv")
df.head(n=20)

Unnamed: 0,Filename,text
0,0,I am so angry and agitated!
1,1,I'm feeling really happy. Happy but also nervous.
2,2,It's been an emotional rollercoaster…
3,3,It was like a combo of anxiety/agitation.
4,4,I had a good day. :)
5,5,"I dislike disliking people, but I can't help b..."
6,6,"I felt bad about work, and I felt bad about my..."


# Run example analysis on pandas DataFrame and save to CSV

In [4]:
# Analyze the "text" column of the toy DataFrame. The dictionary and stopword
# files are both resolved through get_data_path, and output_csv names the
# file the result table is saved to.
results = run_vocabulate_analysis(
    input_data=df,
    text_column="text",
    dict_file=get_data_path("AEV_Dict.csv"),
    stopwords_file=get_data_path("stopwords.txt"),  # bundled stopword list
    output_csv="test_vocabulate_output.csv",
)
# Show the first five result rows.
results.head(n=5)

🔍 Analyzing 7 text(s)...


Processing texts: 100%|██████████| 7/7 [00:00<00:00, 11004.55text/s]

✅ Results saved to test_vocabulate_output.csv
✅ Analysis complete.





Unnamed: 0,Filename,text,WC,TC_Raw,TTR_Raw,TC_Clean,TTR_Clean,TC_NonDict,TTR_NonDict,DictPercent,...,Anger_Count,Anger_Unique,Sadness_CWR,Sadness_CCR,Sadness_Count,Sadness_Unique,NegUndiff_CWR,NegUndiff_CCR,NegUndiff_Count,NegUndiff_Unique
0,0,I am so angry and agitated!,6,7,100.0,2,100.0,0,0.0,28.57143,...,1,1,0.0,0,0,0,0.0,0.0,0,0
1,1,I'm feeling really happy. Happy but also nervous.,8,10,80.0,4,75.0,1,100.0,30.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
2,2,It's been an emotional rollercoaster…,5,6,100.0,3,100.0,2,100.0,16.66667,...,0,0,0.0,0,0,0,20.0,100.0,1,1
3,3,It was like a combo of anxiety/agitation.,7,10,100.0,4,100.0,2,100.0,20.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
4,4,I had a good day. :),6,7,100.0,3,100.0,3,100.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0


# Run analyses with custom stopwords

In [5]:
# Pass the stopword list inline (stopwords_text) instead of pointing at a
# stopwords file: one word per line, each terminated by a newline.
stopwords = (
    "the\n"
    "and\n"
    "is\n"
    "be\n"
    "not\n"
)
results = run_vocabulate_analysis(
    input_data=df,
    text_column="text",
    dict_file=get_data_path("AEV_Dict.csv"),
    stopwords_text=stopwords,
)
# Show the first five result rows.
results.head(n=5)

🔍 Analyzing 7 text(s)...


Processing texts: 100%|██████████| 7/7 [00:00<00:00, 8139.76text/s]

✅ Analysis complete.





Unnamed: 0,Filename,text,WC,TC_Raw,TTR_Raw,TC_Clean,TTR_Clean,TC_NonDict,TTR_NonDict,DictPercent,...,Anger_Count,Anger_Unique,Sadness_CWR,Sadness_CCR,Sadness_Count,Sadness_Unique,NegUndiff_CWR,NegUndiff_CCR,NegUndiff_Count,NegUndiff_Unique
0,0,I am so angry and agitated!,6,7,100.0,6,100.0,4,100.0,28.57143,...,1,1,0.0,0,0,0,0.0,0.0,0,0
1,1,I'm feeling really happy. Happy but also nervous.,8,10,80.0,10,80.0,7,85.71429,30.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
2,2,It's been an emotional rollercoaster…,5,6,100.0,6,100.0,5,100.0,16.66667,...,0,0,0.0,0,0,0,20.0,100.0,1,1
3,3,It was like a combo of anxiety/agitation.,7,10,100.0,10,100.0,8,100.0,20.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
4,4,I had a good day. :),6,7,100.0,7,100.0,7,100.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0


# Run on .txt files in a folder

In [6]:
# Here input_data is a folder name rather than a DataFrame, so no text_column
# is given. The result table is also saved to output_csv.
# NOTE(review): raw_counts=True presumably switches the output to raw counts;
# confirm against the lemo-vocabulate documentation.
txt_results = run_vocabulate_analysis(
    input_data="texts_to_analyze",
    dict_file=get_data_path("AEV_Dict.csv"),
    stopwords_file=get_data_path("stopwords.txt"),  # bundled stopword list
    raw_counts=True,
    output_csv="output.csv",
)
# Show the first five result rows.
txt_results.head(n=5)

🔍 Analyzing 6 text(s)...


Processing texts: 100%|██████████| 6/6 [00:00<00:00, 11715.93text/s]

✅ Results saved to output.csv
✅ Analysis complete.





Unnamed: 0,Filename,text,WC,TC_Raw,TTR_Raw,TC_Clean,TTR_Clean,TC_NonDict,TTR_NonDict,DictPercent,...,Anger_Count,Anger_Unique,Sadness_CWR,Sadness_CCR,Sadness_Count,Sadness_Unique,NegUndiff_CWR,NegUndiff_CCR,NegUndiff_Count,NegUndiff_Unique
0,text_2.txt,It’s been an emotional rollercoaster…,5,8,100.0,4,100.0,3,100.0,12.5,...,0,0,0.0,0,0,0,20.0,100.0,1,1
1,text_3.txt,It was like a combo of anxiety/agitation.,7,10,100.0,4,100.0,2,100.0,20.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
2,text_1.txt,I’m feeling really happy. Happy but also nervous.,8,12,83.33333,5,80.0,2,100.0,25.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0
3,text_0.txt,I am so angry and agitated!,6,7,100.0,2,100.0,0,0.0,28.57143,...,1,1,0.0,0,0,0,0.0,0.0,0,0
4,text_4.txt,I had a good day. :),6,7,100.0,3,100.0,3,100.0,0.0,...,0,0,0.0,0,0,0,0.0,0.0,0,0


In [9]:
# Two-row frame whose texts join words with "/" and "-" and contain no spaces.
# NOTE(review): this rebinds `df`, replacing the frame read from
# example_df.csv; re-running an earlier cell afterwards would analyze this
# frame instead. Consider a dedicated variable name.
# NOTE(review): "avodiant" looks like a typo for "avoidant" - confirm whether
# the misspelling is intentional test input.
df = pd.DataFrame(
    [
        ("user_1", "anxiety/sadness"),
        ("user_2", "anxious-avodiant"),
    ],
    columns=["user_id", "text"],
)

# No whitespace_method is passed here, so the function's default applies.
test_results = run_vocabulate_analysis(
    input_data=df,
    text_column="text",
    dict_file=get_data_path("AEV_Dict.csv"),
    stopwords_file=get_data_path("stopwords.txt"),
    raw_counts=True,
)
# Show the first five result rows (only two exist).
test_results.head(n=5)

🔍 Analyzing 2 text(s)...


Processing texts: 100%|██████████| 2/2 [00:00<00:00, 14513.16text/s]

✅ Analysis complete.





Unnamed: 0,Filename,text,WC,TC_Raw,TTR_Raw,TC_Clean,TTR_Clean,TC_NonDict,TTR_NonDict,DictPercent,...,Anger_Count,Anger_Unique,Sadness_CWR,Sadness_CCR,Sadness_Count,Sadness_Unique,NegUndiff_CWR,NegUndiff_CCR,NegUndiff_Count,NegUndiff_Unique
0,0,anxiety/sadness,2,3,100.0,2,100.0,0,0,66.66667,...,0,0,50.0,100.0,1,1,0.0,0,0,0
1,1,anxious-avodiant,1,1,100.0,1,100.0,0,0,100.0,...,0,0,0.0,0.0,0,0,0.0,0,0,0


In [None]:
# Same two-row input as the default-method check, but run with
# whitespace_method="old" so the two tokenization modes can be compared.
# In the outputs recorded in this notebook, row 0 ("anxiety/sadness") gets
# WC=1 with "old" versus WC=2 without the argument.
# NOTE(review): this rebinds both `df` and `test_results`; consider dedicated
# names so the default-method results stay available for comparison.
df = pd.DataFrame(
    [
        ("user_1", "anxiety/sadness"),
        ("user_2", "anxious-avodiant"),
    ],
    columns=["user_id", "text"],
)

test_results = run_vocabulate_analysis(
    input_data=df,
    text_column="text",
    whitespace_method="old",
    dict_file=get_data_path("AEV_Dict.csv"),
    stopwords_file=get_data_path("stopwords.txt"),
    raw_counts=True,
)
# Show the first five result rows (only two exist).
test_results.head(n=5)

🔍 Analyzing 2 text(s)...


Processing texts: 100%|██████████| 2/2 [00:00<00:00, 12192.74text/s]

✅ Analysis complete.





Unnamed: 0,Filename,text,WC,TC_Raw,TTR_Raw,TC_Clean,TTR_Clean,TC_NonDict,TTR_NonDict,DictPercent,...,Anger_Count,Anger_Unique,Sadness_CWR,Sadness_CCR,Sadness_Count,Sadness_Unique,NegUndiff_CWR,NegUndiff_CCR,NegUndiff_Count,NegUndiff_Unique
0,0,anxiety/sadness,1,3,100.0,2,100.0,0,0,66.66667,...,0,0,100.0,100.0,1,1,0.0,0,0,0
1,1,anxious-avodiant,1,1,100.0,1,100.0,0,0,100.0,...,0,0,0.0,0.0,0,0,0.0,0,0,0
