<a href="https://colab.research.google.com/github/AbeerProg/RRDS/blob/main/wordsCount.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import spacy
import string

# Load SpaCy English model
nlp = spacy.load('en_core_web_sm')

# Load the dataset from the file
df = pd.read_excel('main.xlsx')



In [18]:
# Check if the column name 'Text' exists
if 'text' not in df.columns:
    print("Error: The DataFrame does not contain a 'Text' column.")
else:
    # Set of first-person singular pronouns to preserve
    first_person_singular_pronouns = {"i", "me", "my", "mine", "myself"}

    def preprocess_text(text):
        # Process the text with SpaCy
        doc = nlp(text.lower())  # Lowercase the text

        # Retain words, skip punctuation, and apply selective lemmatization
        processed_tokens = []
        for token in doc:
            if token.is_punct:
                continue
            if token.text in first_person_singular_pronouns:
                processed_tokens.append(token.text)  # Preserve the pronoun as is
            elif not token.is_stop:
                processed_tokens.append(token.lemma_)  # Lemmatize non-stop words

        # Join tokens back into a single string
        processed_text = ' '.join(processed_tokens)
        return processed_text

    def calculate_pronoun_ratio(text):
        if not text:
            return 0.0

        # Process the preprocessed text
        doc = nlp(text)

        total_words = len(doc)
        first_person_singular_count = sum(1 for token in doc if token.text.lower() in first_person_singular_pronouns)

        # Calculate the ratio
        ratio = first_person_singular_count / total_words if total_words > 0 else 0.0
        return ratio


In [19]:

    # Apply the preprocessing function to each review in the DataFrame
    df['preprocessed_text'] = df['text'].apply(preprocess_text)

    # Apply the pronoun ratio calculation to the preprocessed text
    df['FSpronoun_ratio'] = df['preprocessed_text'].apply(calculate_pronoun_ratio)

    # Display the DataFrame with the new columns
    print(df[['text', 'preprocessed_text', 'FSpronoun_ratio']])

    # Save the updated DataFrame to a new Excel file
    df.to_excel('FSP_output.xlsx', index=False)


                                                    text  \
0      - New spot! Just opened this week.\n- Food is ...   
1      - really good smoothies. Particularly the Pita...   
2      !Great New York times.  You must order onion r...   
3                              "FOOT LONG s..Delicious!!   
4      "Hands down" Best coffee shop in Boro Park !!\...   
...                                                  ...   
22751  Zero stars if I could. Reporting this location...   
22752  Zero stars. Rating is not for the food because...   
22753  Zero to minimal wait line if you go at off pea...   
22754  Zoya working register is the slowest human bei...   
22755  Zuo Zongtang chicken and broccoli beef are my ...   

                                       preprocessed_text  FSpronoun_ratio  
0      new spot open week \n food delicious i get chi...         0.069767  
1      good smoothie particularly pitaya kale smoothi...         0.000000  
2      great new york times   order onion ring deli