In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from logic.processing import load_data, preproc
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import SGDClassifier
from imblearn.pipeline import Pipeline as ImbPipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Flotchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Flotchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Flotchi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/Flotchi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/Flotchi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
def data_filter(df,uselfCount=0,min_length=30):
    df = df.dropna()
    df = df[df['rating'].isin([1,2,3,8,9,10])]
    df['sentiment'] = 0
    df['sentiment'] = (df['rating'].isin([8,9,10])).astype(int)
    df = df[df['usefulCount']>uselfCount]
    df['review_length'] = df['review'].apply(lambda x: len(str(x).split()))
    df = df[df.review_length>=min_length]
    return df

In [3]:
def data_filter2(df,uselfCount=0,min_length=20):
    df = df.dropna()
    df['sentiment'] = df['rating'].apply(lambda x: 'Good' if x >= 4 else 'Bad')
    df = df[df['usefulCount']>uselfCount]
    df['review_length'] = df['review'].apply(lambda x: len(str(x).split()))
    df = df[df.review_length>=min_length]
    X = df[['review']] 
    y = df['sentiment'] 

    rus = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = rus.fit_resample(X, y)
    

    df_resampled = pd.DataFrame(X_resampled, columns=['review'])
    df_resampled['sentiment'] = y_resampled
    
    return df_resampled
    return df

In [4]:
def balance_dataset(X, y):
    
    df = pd.DataFrame({'review': X, 'sentiment': y})

    good_reviews = df[df['sentiment'] == 'Good']
    bad_reviews = df[df['sentiment'] == 'Bad']
    

    min_size = min(len(good_reviews), len(bad_reviews))
    

    good_balanced = good_reviews.sample(n=min_size, random_state=42)
    bad_balanced = bad_reviews.sample(n=min_size, random_state=42)
    

    balanced_df = pd.concat([good_balanced, bad_balanced]).sample(frac=1, random_state=42)
    

    X_balanced = balanced_df['review']
    y_balanced = balanced_df['sentiment']
    
    return X_balanced, y_balanced

In [22]:
df_train = load_data('drugsComTrain_raw.csv')
df_test = load_data('drugsComTest_raw.csv')

In [23]:
df_train_filter = data_filter2(df_train)
df_test_filter = data_filter2(df_test)

In [24]:
df_train_prep = preproc(df_train_filter)
df_test_prep = preproc(df_test_filter)

In [8]:
X_train = df_train_prep['clean']
y_train = df_train_prep['sentiment']

In [9]:
X_test = df_test_prep['clean']
y_test = df_test_prep['sentiment']

In [10]:
X_train_balanced, y_train_balanced = balance_dataset(X_train,y_train)

In [11]:
y_train_balanced.value_counts()

Series([], Name: count, dtype: int64)

In [12]:
pipeline = ImbPipeline([
    ('tfidf', TfidfVectorizer(max_df=0.75, min_df=5, ngram_range=(1, 2))),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('SGD', SGDClassifier(loss='hinge', penalty='l2', alpha=1/10, max_iter=100))
])

In [13]:
pipeline2 = make_pipeline(
    TfidfVectorizer(max_df=0.75, min_df=5, ngram_range=(1, 2)),
    SGDClassifier(loss='hinge', penalty='l2', alpha=1/1000, max_iter=1000, class_weight='balanced')
)

In [14]:
pipeline2.fit(X_train_balanced,y_train_balanced)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
y_pred_labels = ['Good' if pred == 1 else 'Bad' for pred in y_pred]

In [None]:
print(classification_report(y_test, y_pred_labels))