<a href="https://colab.research.google.com/github/AbeerProg/RRDS/blob/main/Generality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
import spacy
from collections import Counter

# Instantiate the small English spaCy pipeline used by the scoring functions
nlp = spacy.load(name='en_core_web_sm')

# Read the reviews workbook into a DataFrame
df = pd.read_excel(io='main.xlsx')

# Preview the top five rows to confirm the data loaded as expected
print(df.head(5))


                                                text
0  - New spot! Just opened this week.\n- Food is ...
1  - really good smoothies. Particularly the Pita...
2  !Great New York times.  You must order onion r...
3                          "FOOT LONG s..Delicious!!
4  "Hands down" Best coffee shop in Boro Park !!\...


In [None]:
def lexical_diversity(text):
    """Return the type-token ratio of the alphabetic words in ``text``.

    The text is lower-cased and tokenized with the module-level spaCy
    pipeline ``nlp``; only tokens whose ``is_alpha`` flag is set count as
    words.

    Args:
        text: The text to analyse. Non-string values (e.g. ``None`` or a
            float NaN coming from a blank spreadsheet cell) are treated as
            empty text instead of raising.

    Returns:
        float: ``unique words / total words`` (between 0 and 1), or ``0.0``
        when the text contains no alphabetic tokens.
    """
    # Non-strings have no .lower(); blank strings cannot contain any words.
    if not isinstance(text, str) or not text.strip():
        return 0.0
    # Only tokenization is required: is_alpha is a lexical attribute, so
    # make_doc() skips the tagger/parser/NER components for every review.
    doc = nlp.make_doc(text.lower())
    words = [token.text for token in doc if token.is_alpha]
    if not words:
        return 0.0
    return len(set(words)) / len(words)

def count_named_entities(text):
    """Count the named entities spaCy detects in ``text``.

    The text is processed, with its original casing, by the module-level
    spaCy pipeline ``nlp``.

    Args:
        text: The text to analyse. Non-string values (e.g. ``None`` or a
            float NaN coming from a blank spreadsheet cell) and the empty
            string yield ``0`` without invoking the pipeline.

    Returns:
        int: The number of entity spans in ``doc.ents``.
    """
    if not isinstance(text, str) or not text:
        return 0
    # doc.ents is already a sequence of spans; no intermediate list is needed.
    return len(nlp(text).ents)

def text_generality_score(text):
    """Score how generic a piece of text is; higher means more general.

    The score is the sum of two terms:

    * ``1 - lexical_diversity(text)``: repetitive wording scores higher;
    * ``1 / (count_named_entities(text) + 1)``: fewer named entities
      scores higher.

    Args:
        text: The text to score. Non-string values (e.g. ``None`` or a
            float NaN coming from a blank spreadsheet cell) are treated
            like blank text.

    Returns:
        float: ``0.0`` for blank or non-string input, otherwise a value in
        the range ``(0, 2]``.
    """
    # Settle the blank/missing case first so such rows never pay for the
    # two spaCy passes below, and so NaN cells do not raise on .split().
    if not isinstance(text, str) or not text.split():
        return 0.0
    lex_diversity = lexical_diversity(text)
    named_entity_count = count_named_entities(text)
    # Higher score for more general text
    return (1 - lex_diversity) + (1 / (named_entity_count + 1))


In [None]:
# Score every review and store the result next to its text
review_scores = df['text'].apply(text_generality_score)
df['generality_score'] = review_scores

# Show each review alongside its computed score
scored_view = df[['text', 'generality_score']]
print(scored_view)

# Persist the scored DataFrame as a new workbook, omitting the row index
df.to_excel('text_generality_output.xlsx', index=False)


                                                    text  generality_score
0      - New spot! Just opened this week.\n- Food is ...          0.655172
1      - really good smoothies. Particularly the Pita...          0.444444
2      !Great New York times.  You must order onion r...          0.500000
3                              "FOOT LONG s..Delicious!!          1.000000
4      "Hands down" Best coffee shop in Boro Park !!\...          0.500000
...                                                  ...               ...
22751  Zero stars if I could. Reporting this location...          0.477273
22752  Zero stars. Rating is not for the food because...          0.509259
22753  Zero to minimal wait line if you go at off pea...          0.333333
22754  Zoya working register is the slowest human bei...          1.062500
22755  Zuo Zongtang chicken and broccoli beef are my ...          0.545455

[22756 rows x 2 columns]
