In [95]:
from typing import List, Dict
import warnings
warnings.filterwarnings("ignore", message="A parameter name that contains")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix
from sklearn import metrics
from imblearn.over_sampling import SMOTE
import textstat
import joblib

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Elgabrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Elgabrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Elgabrey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Elgabrey\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Elgabrey\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
def load_and_prepare_data(path: str) -> pd.DataFrame:
    """
    Load dataset and rename columns.

    Args:
        path (str): File path to CSV dataset.

    Returns:
        pd.DataFrame: Prepared dataframe with columns ['text', 'label'].
    """
    data = pd.read_csv(path)
    data.rename(columns={'sentence': 'text', 'emotion': 'label'}, inplace=True)
    return data

data = load_and_prepare_data(r"D:\Grad_Proj\project\combined_emotion.csv")
data

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,fear
1,ive enjoyed being able to slouch about relax a...,sad
2,i gave up my internship with the dmrg and am f...,fear
3,i dont know i feel so lost,sad
4,i am a kindergarten teacher and i am thoroughl...,fear
...,...,...
422741,i begun to feel distressed for you,fear
422742,i left feeling annoyed and angry thinking that...,anger
422743,i were to ever get married i d have everything...,joy
422744,i feel reluctant in applying there because i w...,fear


In [97]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422746 entries, 0 to 422745
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    422746 non-null  object
 1   label   422746 non-null  object
dtypes: object(2)
memory usage: 6.5+ MB


In [98]:
data['No_of_Chars'] = data['text'].apply(len)
data['No_of_Words'] = data.apply(lambda row: nltk.word_tokenize(row['text']), axis= 1).apply(len)
data['No_of_Sents'] = data.apply(lambda row: nltk.sent_tokenize(row['text']), axis= 1).apply(len)

data.describe()

Unnamed: 0,No_of_Chars,No_of_Words,No_of_Sents
count,422746.0,422746.0,422746.0
mean,97.03398,19.220179,1.0
std,56.198156,11.057121,0.0
min,2.0,1.0,1.0
25%,54.0,11.0,1.0
50%,86.0,17.0,1.0
75%,128.0,25.0,1.0
max,830.0,178.0,1.0


In [99]:
def encode_labels(df: pd.DataFrame, col: str) -> (pd.DataFrame, LabelEncoder):
    """
    Encode categorical labels to integers.

    Args:
        df (pd.DataFrame): Input dataframe.
        col (str): Column name of labels.

    Returns:
        tuple: Dataframe with encoded labels, fitted LabelEncoder.
    """
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    return df, le

data, le = encode_labels(data, 'label')


In [100]:
""" cols_color = ['black', 'blue', 'red', 'green', 'purple', 'cyan']
plt.figure(figsize=(12,8))
fg = sns.countplot(x= data['label'], palette= cols_color)
fg.set_title('count plot of classes')
fg.set_xlabel('classes')
fg.set_ylabel('count of classes') """

" cols_color = ['black', 'blue', 'red', 'green', 'purple', 'cyan']\nplt.figure(figsize=(12,8))\nfg = sns.countplot(x= data['label'], palette= cols_color)\nfg.set_title('count plot of classes')\nfg.set_xlabel('classes')\nfg.set_ylabel('count of classes') "

In [101]:
""" plt.figure(figsize=(12,8))
fg = sns.pairplot(data= data, hue= 'label', palette= cols_color)
plt.show(fg) """

" plt.figure(figsize=(12,8))\nfg = sns.pairplot(data= data, hue= 'label', palette= cols_color)\nplt.show(fg) "

In [102]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [103]:
from nltk.corpus import wordnet

def get_wordnet_pos(tag):
    """
    تحويل POS tag من شكل NLTK إلى الشكل المطلوب من WordNetLemmatizer.
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # الافتراضي




In [104]:
def preprocess_text(text: str) -> str:
    """
    تنظيف النص مع تحسين lemmatization باستخدام POS tagging.
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tags
        if word not in stop_words
    ]
    return ' '.join(lemmatized_words)

data['lemmatized_words'] = data['text'].apply(preprocess_text)

In [105]:
""" cols_color = ['black', 'blue', 'red', 'green', 'purple', 'cyan']
plt.figure(figsize=(12,8))
fg = sns.countplot(x= data['label'], palette= cols_color)
fg.set_title('count plot of classes')
fg.set_xlabel('classes')
fg.set_ylabel('count of classes') """

" cols_color = ['black', 'blue', 'red', 'green', 'purple', 'cyan']\nplt.figure(figsize=(12,8))\nfg = sns.countplot(x= data['label'], palette= cols_color)\nfg.set_title('count plot of classes')\nfg.set_xlabel('classes')\nfg.set_ylabel('count of classes') "

In [106]:
""" all_words = " ".join(sentence for sentence in data['lemmatized_words'])
all_words

wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(all_words)

plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show() """

' all_words = " ".join(sentence for sentence in data[\'lemmatized_words\'])\nall_words\n\nwordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(all_words)\n\nplt.figure(figsize=(12,8))\nplt.imshow(wordcloud, interpolation=\'bilinear\')\nplt.axis(\'off\')\nplt.show() '

In [107]:
tfidf = TfidfVectorizer(max_features=1000, max_df=0.9, min_df=2, stop_words='english', ngram_range=(1, 2))
sia = SentimentIntensityAnalyzer()

def vectorize_text(df: pd.DataFrame, text_col: str, vectorizer: TfidfVectorizer):
    """
    Vectorize text column using TF-IDF.

    Args:
        df (pd.DataFrame): Dataframe containing text data.
        text_col (str): Name of the column with preprocessed text.
        vectorizer (TfidfVectorizer): Initialized vectorizer.

    Returns:
        sparse matrix: TF-IDF features matrix.
    """
    X = vectorizer.fit_transform(df[text_col])
    return X

sampled_data = data.sample(frac=0.3, random_state=42)
X = vectorize_text(sampled_data, 'lemmatized_words', tfidf)
y = sampled_data['label']


In [108]:
def extract_additional_features(text: str) -> list:
    words = word_tokenize(text)
    num_words = len(words)
    num_chars = len(text)
    sentiment = sia.polarity_scores(text)['compound']
    readability = textstat.flesch_reading_ease(text)
    return [num_words, num_chars, sentiment, readability]

additional_features = np.array([extract_additional_features(text) for text in sampled_data['lemmatized_words']])
scaler = StandardScaler()
additional_features_scaled = scaler.fit_transform(additional_features)

In [109]:
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

X_combined = hstack([X, csr_matrix(additional_features_scaled)])


In [110]:
def extract_pos_tags(text):
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    return ' '.join(tag for word, tag in pos_tags)

sampled_data['pos_tags'] = sampled_data['lemmatized_words'].apply(extract_pos_tags)

tfidf_pos = TfidfVectorizer()
X_pos = tfidf_pos.fit_transform(sampled_data['pos_tags'])
sampled_data['pos_tags']

52840                      NN IN NN JJ NN VB JJ JJ NN NN NN
338919    VB JJ NN RB VBP RB JJ NN VBP RB JJ NN VBP JJ R...
4335                                              VBG NN NN
252892                        VB IN NN JJ NN NN VB JJ NN RB
334554               VB RB JJ JJ JJ NN NN JJ NN JJ NN NN NN
                                ...                        
31865                               NN IN JJ NN NN VB IN NN
72273                                           NN NN NN NN
138646                             JJ NN RB JJ RB VBP NN NN
158786                       NN NN RB JJ NN VB NN NN VBP NN
202847                                          NN RB JJ NN
Name: pos_tags, Length: 126824, dtype: object

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, stratify=y, random_state=42)

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE:", np.bincount(y_train_balanced))


Before SMOTE: [14272 11980 34326  8252 28936  3693]
After SMOTE: [34326 34326 34326 34326 34326 34326]


In [112]:
from collections import Counter
print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_balanced))


Before SMOTE: Counter({2: 34326, 4: 28936, 0: 14272, 1: 11980, 3: 8252, 5: 3693})
After SMOTE: Counter({2: 34326, 1: 34326, 0: 34326, 4: 34326, 5: 34326, 3: 34326})


In [113]:
row = X[0].toarray().flatten()
for i, val in enumerate(row):
    if val != 0:
        print(f'index {i}: {val}')


index 80: 0.5824834849061667
index 134: 0.5461345627470632
index 261: 0.08891625239402497
index 327: 0.25158305605751424
index 453: 0.4915822572203815
index 579: 0.22270334424009006


In [114]:
def train_and_evaluate(X_train, y_train, X_test, y_test):
    """
    Train Random Forest and evaluate on test set.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Testing features and labels.

    Returns:
        model: Trained Random Forest model.
    """

    
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    
    
    """  param_grid = {
    'n_estimators': [100, 200, 300],           # عدد الأشجار
    'max_depth': [None, 10, 20, 30],           # أقصى عمق للشجرة
    'min_samples_split': [2, 5, 10]             # أقل عدد عينات لتقسيم عقدة
    } """
    """  
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid,cv=3, n_jobs=-1, verbose=2, scoring='f1_weighted')
    grid_search.fit(X_train_balanced, y_train_balanced)
    
    print("أفضل المعاملات:", grid_search.best_params_)
    print("أفضل نتيجة F1 (weighted):", grid_search.best_score_)
    
    best_rf_model = grid_search.best_estimator_ """

    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Test set F1 Score (weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(classification_report(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    
    """ cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show() """
    
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    return model

random_forest_model = train_and_evaluate(X_train_balanced, y_train_balanced, X_test, y_test)


Accuracy: 0.7797
Test set F1 Score (weighted): 0.7802
              precision    recall  f1-score   support

           0       0.83      0.75      0.79      3568
           1       0.78      0.71      0.74      2995
           2       0.82      0.82      0.82      8582
           3       0.66      0.70      0.68      2063
           4       0.78      0.81      0.80      7234
           5       0.56      0.65      0.60       923

    accuracy                           0.78     25365
   macro avg       0.74      0.74      0.74     25365
weighted avg       0.78      0.78      0.78     25365

              precision    recall  f1-score   support

       anger       0.83      0.75      0.79      3568
        fear       0.78      0.71      0.74      2995
         joy       0.82      0.82      0.82      8582
        love       0.66      0.70      0.68      2063
         sad       0.78      0.81      0.80      7234
     suprise       0.56      0.65      0.60       923

    accuracy           

نفس الموديل بس بطريقة اسرع لأنه يجرب عينات عشوائية من المعاملات بدل كل التوليفات:

In [115]:
def train_and_evaluate(X_train, y_train, X_test, y_test):
    """
    Train Random Forest and evaluate on test set.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Testing features and labels.

    Returns:
        model: Trained Random Forest model.
    """

    
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    
    
    """  param_grid = {
    'n_estimators': [100, 200, 300],           # عدد الأشجار
    'max_depth': [None, 10, 20, 30],           # أقصى عمق للشجرة
    'min_samples_split': [2, 5, 10]             # أقل عدد عينات لتقسيم عقدة
    } """
    """  
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid,cv=3, n_jobs=-1, verbose=2, scoring='f1_weighted')
    grid_search.fit(X_train_balanced, y_train_balanced)
    
    print("أفضل المعاملات:", grid_search.best_params_)
    print("أفضل نتيجة F1 (weighted):", grid_search.best_score_)
    
    best_rf_model = grid_search.best_estimator_ """

    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Test set F1 Score (weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(classification_report(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    
    """ cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show() """
    
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    return model

random_forest_model = train_and_evaluate(X_train_balanced, y_train_balanced, X_test, y_test)


Accuracy: 0.7797
Test set F1 Score (weighted): 0.7802
              precision    recall  f1-score   support

           0       0.83      0.75      0.79      3568
           1       0.78      0.71      0.74      2995
           2       0.82      0.82      0.82      8582
           3       0.66      0.70      0.68      2063
           4       0.78      0.81      0.80      7234
           5       0.56      0.65      0.60       923

    accuracy                           0.78     25365
   macro avg       0.74      0.74      0.74     25365
weighted avg       0.78      0.78      0.78     25365

              precision    recall  f1-score   support

       anger       0.83      0.75      0.79      3568
        fear       0.78      0.71      0.74      2995
         joy       0.82      0.82      0.82      8582
        love       0.66      0.70      0.68      2063
         sad       0.78      0.81      0.80      7234
     suprise       0.56      0.65      0.60       923

    accuracy           

In [116]:
""" from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None] + list(range(5, 31, 5)),
    'min_samples_split': randint(2, 11)
}

rf = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=20, cv=3, n_jobs=-1, verbose=2, scoring='f1_weighted', random_state=42)

random_search.fit(X_train_balanced, y_train_balanced)

print("أفضل المعاملات:", random_search.best_params_)
print("أفضل نتيجة F1 (weighted):", random_search.best_score_)

best_rf_model = random_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
print(f"Test set F1 Score (weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")
print(classification_report(y_test, y_pred))
 """

' from sklearn.model_selection import RandomizedSearchCV\nfrom scipy.stats import randint\n\nparam_dist = {\n    \'n_estimators\': randint(100, 500),\n    \'max_depth\': [None] + list(range(5, 31, 5)),\n    \'min_samples_split\': randint(2, 11)\n}\n\nrf = RandomForestClassifier(random_state=42)\n\nrandom_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,\n                                   n_iter=20, cv=3, n_jobs=-1, verbose=2, scoring=\'f1_weighted\', random_state=42)\n\nrandom_search.fit(X_train_balanced, y_train_balanced)\n\nprint("أفضل المعاملات:", random_search.best_params_)\nprint("أفضل نتيجة F1 (weighted):", random_search.best_score_)\n\nbest_rf_model = random_search.best_estimator_\n\ny_pred = best_rf_model.predict(X_test)\nprint(f"Test set F1 Score (weighted): {f1_score(y_test, y_pred, average=\'weighted\'):.4f}")\nprint(classification_report(y_test, y_pred))\n '

In [117]:
""" importances = random_forest_model.feature_importances_
indices = np.argsort(importances)[-20:]  # أهم 20
plt.figure(figsize=(10,6))
plt.barh(range(len(indices)), importances[indices])
plt.title("Top 20 Feature Importances")
plt.show()
 """

' importances = random_forest_model.feature_importances_\nindices = np.argsort(importances)[-20:]  # أهم 20\nplt.figure(figsize=(10,6))\nplt.barh(range(len(indices)), importances[indices])\nplt.title("Top 20 Feature Importances")\nplt.show()\n '

In [118]:
""" models = {
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

for name, model in models.items():
    model.fit(X_train_balanced, y_train_balanced)
    y_pred = model.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"{name} F1 Score: {f1_score(y_test, y_pred, average='weighted'):.2f}")
    print("-" * 30)
 """

' models = {\n    \'Random Forest\': RandomForestClassifier(),\n    \'Naive Bayes\': MultinomialNB(),\n    \'SVM\': SVC(),\n    \'KNN\': KNeighborsClassifier()\n}\n\nfor name, model in models.items():\n    model.fit(X_train_balanced, y_train_balanced)\n    y_pred = model.predict(X_test)\n    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.2f}")\n    print(f"{name} F1 Score: {f1_score(y_test, y_pred, average=\'weighted\'):.2f}")\n    print("-" * 30)\n '

In [119]:
def predict_emotion(text, model, vectorizer, label_encoder, scaler):
    processed = preprocess_text(text)
    
    # تحويل النص إلى TF-IDF
    vect_text = vectorizer.transform([processed])
    
    # حساب الميزات الإضافية
    additional_feats = np.array([extract_additional_features(processed)])
    additional_feats_scaled = scaler.transform(additional_feats)
    
    # دمج الميزات
    from scipy.sparse import hstack, csr_matrix
    combined_features = hstack([vect_text, csr_matrix(additional_feats_scaled)])
    
    # التنبؤ
    prediction = model.predict(combined_features)
    return label_encoder.inverse_transform(prediction)[0]


# مثال تجربة
print(predict_emotion("I feel so happy and joyful today!", random_forest_model, tfidf, le, scaler))


joy


In [120]:
from collections import Counter
print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_balanced))


Before SMOTE: Counter({2: 34326, 4: 28936, 0: 14272, 1: 11980, 3: 8252, 5: 3693})
After SMOTE: Counter({2: 34326, 1: 34326, 0: 34326, 4: 34326, 5: 34326, 3: 34326})


In [121]:
# تحميل قاعدة بيانات التوصية
books_db = pd.read_csv("books.csv", index_col='book_id')
books_db = books_db[['original_title', 'authors']]
books_db.rename(columns={'original_title': 'title'}, inplace=True)
books_db




Unnamed: 0_level_0,title,authors
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2767052,The Hunger Games,Suzanne Collins
3,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré"
41865,Twilight,Stephenie Meyer
2657,To Kill a Mockingbird,Harper Lee
4671,The Great Gatsby,F. Scott Fitzgerald
...,...,...
7130616,Bayou Moon,Ilona Andrews
208324,Means of Ascent,Robert A. Caro
77431,The Mauritius Command,Patrick O'Brian
8565083,Cinderella Ate My Daughter: Dispatches from th...,Peggy Orenstein


In [122]:
books_db.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 2767052 to 8914
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    9415 non-null   object
 1   authors  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [123]:
books_db.dropna(inplace=True)

In [124]:
# تحميل داتاست المقالات
articles_db = pd.read_csv("articles.csv", index_col='id')  # يحتوي على title, link
articles_db


Unnamed: 0_level_0,url,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,https://towardsdatascience.com/not-all-rainbow...,Not All Rainbows and Sunshine: The Darker Side...
2,https://towardsdatascience.com/ethics-in-ai-po...,Ethics in AI: Potential Root Causes for Biased...
3,https://towardsdatascience.com/python-tuple-th...,"Python Tuple, The Whole Truth and Only the Tru..."
4,https://towardsdatascience.com/dates-and-subqu...,Dates and Subqueries in SQL
5,https://towardsdatascience.com/temporal-differ...,Temporal Differences with Python: First Sample...
...,...,...
2494,https://medium.com/swlh/brian-chesky-is-an-exa...,Brian Chesky is an Example of What it Means to...
2495,https://medium.com/swlh/5-red-flags-of-online-...,5 Red Flags of Online Business Gurus
2496,https://writingcooperative.com/recognizing-the...,Recognizing These Three Realities Can Help Set...
2497,https://writingcooperative.com/i-remember-it-l...,“I Remember It Like It Was Just Yesterday…” Re...


In [125]:
articles_db.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2498 entries, 1 to 2498
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     2498 non-null   object
 1   title   2498 non-null   object
dtypes: object(2)
memory usage: 58.5+ KB


In [126]:
# تحميل النموذج و TF-IDF
model = joblib.load("./models/random_forest_model.pkl")
vectorizer = joblib.load("./models/tfidf_vectorizer.pkl")

In [127]:
from scipy.sparse import hstack, csr_matrix
from typing import List
from sklearn.base import BaseEstimator

def classify_emotions(
    df: pd.DataFrame,
    text_column: str,
    model: BaseEstimator,
    vectorizer: TfidfVectorizer,
    scaler: StandardScaler,
    le: LabelEncoder,
    positive_emotions: List[str] = ["joy", "love", "surprise"],
    save_path: str = None
) -> pd.DataFrame:
    """
    يصنف المشاعر في عمود نصي ويُرجع فقط الصفوف ذات المشاعر الإيجابية.

    Args:
        df (pd.DataFrame): داتا فريم يحتوي على النصوص.
        text_column (str): اسم العمود الذي يحتوي على النصوص (مثل: "title" أو "book_title").
        model (BaseEstimator): نموذج تصنيف مدرب.
        vectorizer (TfidfVectorizer): محول TF-IDF.
        scaler (StandardScaler): مقياس الميزات الإضافية.
        le (LabelEncoder): مشفر العواطف.
        positive_emotions (List[str], optional): قائمة بالعواطف الإيجابية للاحتفاظ بها.
        save_path (str, optional): إذا تم توفيره، سيتم حفظ الداتا فريم النهائي في هذا المسار كملف CSV.

    Returns:
        pd.DataFrame: داتا فريم يحتوي على النصوص المصنفة ذات العواطف الإيجابية.
    """

    # تحقق من وجود العمود
    if text_column not in df.columns:
        raise ValueError(f"❌ العمود '{text_column}' غير موجود في الداتا فريم.")

    # تنظيف النصوص
    df['clean_text'] = df[text_column].apply(preprocess_text)

    # استخراج ميزات TF-IDF
    X_text = vectorizer.transform(df['clean_text'])

    # استخراج الميزات الإضافية
    additional_features = np.array([
        extract_additional_features(text) for text in df['clean_text']
    ])
    additional_scaled = scaler.transform(additional_features)

    # دمج الميزات
    combined_features = hstack([X_text, csr_matrix(additional_scaled)])

    # التنبؤ بالعاطفة
    predicted = model.predict(combined_features)
    df['emotion'] = le.inverse_transform(predicted)

    # الاحتفاظ فقط بالعواطف الإيجابية
    df_filtered = df[df['emotion'].isin(positive_emotions)]


    # حفظ إذا تم تحديد مسار
    if save_path:
        df_filtered.to_csv(save_path, index=False)

    return df_filtered


In [128]:
try:
    df_clean = classify_emotions(
        df=books_db,
        text_column="title",
        model=model,
        vectorizer=vectorizer,
        scaler=scaler,
        le=le,
        save_path="classified_books.csv"
    )
except ValueError as e:
    print(str(e))


In [129]:
try:
    df_clean = classify_emotions(
        df=articles_db,
        text_column="title",
        model=model,
        vectorizer=vectorizer,
        scaler=scaler,
        le=le,
        save_path="classified_articles.csv"
    )
except ValueError as e:
    print(str(e))


In [130]:
def filter_by_keywords(df: pd.DataFrame, include_keywords=None, exclude_keywords=None) -> pd.DataFrame:
    """
    ترشيح المحتوى بناءً على كلمات مفتاحية.
    
    Args:
        df (pd.DataFrame): جدول المحتوى.
        include_keywords (list): كلمات يجب أن يحتويها النص.
        exclude_keywords (list): كلمات يجب ألا يحتويها النص.
    
    Returns:
        pd.DataFrame: محتوى بعد التصفية.
    """
    if include_keywords:
        pattern = '|'.join(include_keywords)
        df = df[df['title'].str.contains(pattern, case=False, na=False)]
    
    if exclude_keywords:
        pattern = '|'.join(exclude_keywords)
        df = df[~df['title'].str.contains(pattern, case=False, na=False)]

    return df

In [131]:
mood_map = {
    'sadness': ['joy', 'surprise'],
    'anger': ['love', 'joy'],
    'fear': ['love', 'joy'],
    'joy': ['joy','surprise', 'love'],
    'surprise': ['love', 'joy'],
    'love': ['joy', 'love', 'surprise']
    }

In [133]:
""" 
def recommend_content_filtered(
    user_text: str,
    model,
    vectorizer,
    label_encoder,
    rec_db: pd.DataFrame,
    include_keywords=None,
    exclude_keywords=None
) -> pd.DataFrame:
"""
#توصية بمحتوى مع فلترة اختيارية بناءً على الكلمات المفتاحية.
"""
    # تنظيف أسماء الأعمدة من المسافات الزائدة
    rec_db.columns = rec_db.columns.str.strip()

    # التأكد من وجود الأعمدة المطلوبة
    if 'predicted_emotion' not in rec_db.columns or 'original_title' not in rec_db.columns:
        raise KeyError("⚠️ تأكد أن الداتا تحتوي على الأعمدة: 'original_title' و 'predicted_emotion'.")

    # التنبؤ بالمشاعر للنص
    user_emotion = predict_emotion(user_text, model, vectorizer, label_encoder)
    print(f"🔍 Detected Emotion: {user_emotion}")

    # تحديد المشاعر المستهدفة بناءً على mood_map
    target_emotions = mood_map.get(user_emotion, ['joy', 'calm', 'confidence'])

    # ترشيح المحتوى بناءً على المشاعر
    recommended = rec_db[rec_db['predicted_emotion'].isin(target_emotions)]

    # فلترة المحتوى بناءً على الكلمات
    filtered = filter_by_keywords(recommended, include_keywords, exclude_keywords)

    # في حالة عدم وجود نتائج بعد الفلترة
    if filtered.empty:
        print("⚠️ لا يوجد محتوى يلبي معايير الفلترة. عرض نتائج بدون فلترة.")
        return recommended[['original_title', 'authors','predicted_emotion']].sample(min(3, len(recommended)))

    return filtered[['original_title', 'authors', 'predicted_emotion']].sample(min(3, len(filtered)))

 """

'\n    # تنظيف أسماء الأعمدة من المسافات الزائدة\n    rec_db.columns = rec_db.columns.str.strip()\n\n    # التأكد من وجود الأعمدة المطلوبة\n    if \'predicted_emotion\' not in rec_db.columns or \'original_title\' not in rec_db.columns:\n        raise KeyError("⚠️ تأكد أن الداتا تحتوي على الأعمدة: \'original_title\' و \'predicted_emotion\'.")\n\n    # التنبؤ بالمشاعر للنص\n    user_emotion = predict_emotion(user_text, model, vectorizer, label_encoder)\n    print(f"🔍 Detected Emotion: {user_emotion}")\n\n    # تحديد المشاعر المستهدفة بناءً على mood_map\n    target_emotions = mood_map.get(user_emotion, [\'joy\', \'calm\', \'confidence\'])\n\n    # ترشيح المحتوى بناءً على المشاعر\n    recommended = rec_db[rec_db[\'predicted_emotion\'].isin(target_emotions)]\n\n    # فلترة المحتوى بناءً على الكلمات\n    filtered = filter_by_keywords(recommended, include_keywords, exclude_keywords)\n\n    # في حالة عدم وجود نتائج بعد الفلترة\n    if filtered.empty:\n        print("⚠️ لا يوجد محتوى يلبي معايي

In [134]:
def recommend_books(emotion: str, top_n: int = 5) -> List[Dict]:
    df = pd.read_csv("classified_books.csv")  # يحتوي على title, link

    # تصنيف المقالات إذا لسه ما اتصنفتش
    if "emotion" not in df.columns:
        df["emotion"] = df["title"].apply(predict_emotion)

    # حدد المشاعر اللي ممكن تحسن المزاج
    target_emotions = mood_map.get(emotion, ["joy", "love", "surprise"])

    # رشّح المقالات اللي مشاعرها إيجابية ومناسبة لتحسين المزاج
    recommended = df[df["emotion"].isin(target_emotions)].head(top_n)

    return recommended[["title", "authors", "emotion"]].to_dict(orient="records")


In [135]:
def recommend_articles(emotion: str, top_n: int = 5) -> List[Dict]:
    df = pd.read_csv("classified_articles.csv")  # يحتوي على title, link

    # تصنيف المقالات إذا لسه ما اتصنفتش
    if "emotion" not in df.columns:
        df["emotion"] = df["title"].apply(predict_emotion)

    # حدد المشاعر اللي ممكن تحسن المزاج
    target_emotions = mood_map.get(emotion, ["joy", "love", "surprise"])

    # رشّح المقالات اللي مشاعرها إيجابية ومناسبة لتحسين المزاج
    recommended = df[df["emotion"].isin(target_emotions)].head(top_n)

    return recommended[["title", "url", "emotion"]].to_dict(orient="records")


In [136]:
def recommend_content(emotion: str, top_n: int = 5) -> Dict[str, List[Dict]]:
    books = recommend_books(emotion)[:top_n]
    articles = recommend_articles(emotion)[:top_n]
    return {
        "books": books,
        "articles": articles
    }


In [137]:
results = recommend_content(emotion="sad")

print("Books:")
for item in results.get('books', []):
    print(f"- Title: {item.get('title')}, Authors: {item.get('authors')}, Emotion: {item.get('emotion')}")

print("\nArticles:")
for item in results.get('articles', []):
    print(f"- Title: {item.get('title')}, Link: {item.get('link')}, Emotion: {item.get('emotion')}")


Books:
- Title: The Great Gatsby, Authors: F. Scott Fitzgerald, Emotion: joy
- Title: The Hobbit or There and Back Again, Authors: J.R.R. Tolkien, Emotion: joy
- Title: The Catcher in the Rye, Authors: J.D. Salinger, Emotion: joy
- Title: Angels & Demons , Authors: Dan Brown, Emotion: joy
- Title: The Kite Runner , Authors: Khaled Hosseini, Emotion: joy

Articles:
- Title: Not All Rainbows and Sunshine: The Darker Side of ChatGPT, Link: None, Emotion: joy
- Title: Python Tuple, The Whole Truth and Only the Truth: Let’s Dig Deep, Link: None, Emotion: joy
- Title: Temporal Differences with Python: First Sample-Based Reinforcement Learning Algorithm, Link: None, Emotion: joy
- Title: 10 Subtle Strategies I Wish I Knew When I Had 23 Email Subscribers and Made $0 Online, Link: None, Emotion: joy
- Title: Don’t Become a Full-Time Content Creator If You Have Low-Risk Tolerance, Link: None, Emotion: joy


In [138]:
results = recommend_books(emotion="sad")
for book in results:
    print(book)

{'title': 'The Great Gatsby', 'authors': 'F. Scott Fitzgerald', 'emotion': 'joy'}
{'title': 'The Hobbit or There and Back Again', 'authors': 'J.R.R. Tolkien', 'emotion': 'joy'}
{'title': 'The Catcher in the Rye', 'authors': 'J.D. Salinger', 'emotion': 'joy'}
{'title': 'Angels & Demons ', 'authors': 'Dan Brown', 'emotion': 'joy'}
{'title': 'The Kite Runner ', 'authors': 'Khaled Hosseini', 'emotion': 'joy'}


In [139]:
results = recommend_articles(emotion="sad")
for article in results:
    print(article)


{'title': 'Not All Rainbows and Sunshine: The Darker Side of\xa0ChatGPT', 'url': 'https://towardsdatascience.com/not-all-rainbows-and-sunshine-the-darker-side-of-chatgpt-75917472b9c', 'emotion': 'joy'}
{'title': 'Python Tuple, The Whole Truth and Only the Truth: Let’s Dig\xa0Deep', 'url': 'https://towardsdatascience.com/python-tuple-the-whole-truth-and-only-truth-lets-dig-deep-24d2bf02971b', 'emotion': 'joy'}
{'title': 'Temporal Differences with Python: First Sample-Based Reinforcement Learning Algorithm', 'url': 'https://towardsdatascience.com/temporal-differences-with-python-first-sample-based-reinforcement-learning-algorithm-54c11745a0ee', 'emotion': 'joy'}
{'title': '10 Subtle Strategies I Wish I Knew When I Had 23 Email Subscribers and Made $0\xa0Online', 'url': 'https://medium.com/swlh/10-subtle-strategies-i-wish-i-knew-when-i-had-23-email-subscribers-and-made-0-online-3eb65c335060', 'emotion': 'joy'}
{'title': 'Don’t Become a Full-Time Content Creator If You Have Low-Risk Tolera

In [None]:
import joblib

joblib.dump(random_forest_model, './models/random_forest_model.pkl')
joblib.dump(tfidf, './models/tfidf_vectorizer.pkl')
joblib.dump(le, './models/label_encoder.pkl')
joblib.dump(scaler, './models/scaler.pkl')

['./models/scaler.pkl']