In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import csr_matrix

df = pd.read_csv('total.csv')

df_sample = df.sample(frac=1)

corpus_sample = df_sample['review'].tolist()

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features= 1000)

tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_sample)

tfidf_matrix_sparse = csr_matrix(tfidf_matrix)


In [5]:
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix_sparse, columns=feature_names)

In [6]:
print(df.shape)

(212698, 12)


In [7]:
print(df)

            id  drugName  condition  \
0       163740      2156        193   
1       206473      2067        173   
2       159672       402        767   
3        39293       827        789   
4        97768       869         95   
...        ...       ...        ...   
212693  191035       593         20   
212694  127085      2103        472   
212695  187382      2429        646   
212696   47128      3224        761   
212697  215220      1961        163   

                                                   review  rating        date  \
0       "I've tried a few antidepressants over the yea...    10.0  2012-02-28   
1       "My son has Crohn's disease and has done very ...     8.0  2009-05-17   
2                           "Quick reduction of symptoms"     9.0  2017-09-29   
3       "Contrave combines drugs that were used for al...     9.0  2017-03-05   
4       "I have been on this birth control for one cyc...     9.0  2015-10-22   
...                                          

In [8]:
tfidf_df['sentiment_score'] = df['sentiment_score']
tfidf_df['condition'] = df['condition']
tfidf_df['rating'] = df['rating']
tfidf_df['scaled_usefulCount'] = df['scaled_usefulCount']
tfidf_df['rating_scaled_usefulCount'] = df['rating_scaled_usefulCount']

In [9]:
# tfidf_df = pd.read_csv("tlfdf.csv")

In [10]:
from sklearn.model_selection import train_test_split

X = tfidf_df.drop([ 'rating_scaled_usefulCount', 'scaled_usefulCount','rating'], axis=1)  # Features
y = tfidf_df['rating_scaled_usefulCount']
print(X)


              10  100  10mg        11        12   14        15   16        18  \
0       0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
1       0.000000  0.0   0.0  0.139285  0.000000  0.0  0.114037  0.0  0.000000   
2       0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
3       0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
4       0.000000  0.0   0.0  0.000000  0.000000  0.0  0.171079  0.0  0.000000   
...          ...  ...   ...       ...       ...  ...       ...  ...       ...   
212693  0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.106295   
212694  0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
212695  0.000000  0.0   0.0  0.000000  0.136148  0.0  0.000000  0.0  0.000000   
212696  0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
212697  0.084093  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   

         20  ...       yet 

In [11]:
tfidf_df['id'] = df['id']

In [12]:
from sklearn.preprocessing import MinMaxScaler

y = y.values.reshape(-1, 1) if len(y.shape) == 1 else y

scaler = MinMaxScaler()

# Fit and transform the data
y_scaled = scaler.fit_transform(y)

print(y_scaled)


[[0.01704105]
 [0.12252252]
 [0.00661765]
 ...
 [0.02651515]
 [0.24159021]
 [0.71020408]]


In [13]:
yy = (y_scaled > 0.5).astype(int)
print(yy)

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [1]]


In [14]:
import numpy as np

counts = np.bincount(yy.flatten())

# Print the counts
print("Number of 0s:", counts[0])
print("Number of 1s:", counts[1])


Number of 0s: 206775
Number of 1s: 5923


In [15]:
tfidf_df['rating_scaled_usefulCount'] = yy

In [16]:
print(X)
print(tfidf_df['rating_scaled_usefulCount'])
# Assuming 'rating_scaled_usefulCount' is your target variable
y = tfidf_df['rating_scaled_usefulCount']

# Count the occurrences of each value
class_distribution = y.value_counts()

# Display the class distribution
print("Class distribution in 'rating_scaled_usefulCount':")
print(class_distribution)


              10  100  10mg        11        12   14        15   16        18  \
0       0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
1       0.000000  0.0   0.0  0.139285  0.000000  0.0  0.114037  0.0  0.000000   
2       0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
3       0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
4       0.000000  0.0   0.0  0.000000  0.000000  0.0  0.171079  0.0  0.000000   
...          ...  ...   ...       ...       ...  ...       ...  ...       ...   
212693  0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.106295   
212694  0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
212695  0.000000  0.0   0.0  0.000000  0.136148  0.0  0.000000  0.0  0.000000   
212696  0.000000  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   
212697  0.084093  0.0   0.0  0.000000  0.000000  0.0  0.000000  0.0  0.000000   

         20  ...       yet 

In [17]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
X = tfidf_df.drop(['rating_scaled_usefulCount', 'scaled_usefulCount', 'rating'], axis=1)
y = tfidf_df['rating_scaled_usefulCount']

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X, y)





In [18]:
print(y_train_resampled)

0         0
1         0
2         0
3         0
4         0
         ..
413545    1
413546    1
413547    1
413548    1
413549    1
Name: rating_scaled_usefulCount, Length: 413550, dtype: int32


In [19]:
print(y_train_resampled.value_counts())

rating_scaled_usefulCount
0    206775
1    206775
Name: count, dtype: int64


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.3, random_state=42, stratify=y_train_resampled
)

In [21]:
X_train_no_id = X_train.drop(['id'], axis=1)
X_test_no_id = X_test.drop(['id'], axis = 1 )

In [28]:
print(X_train_no_id)
print(y_train.shape)

              10  100  10mg        11   12   14   15        16   18   20  ...  \
69191   0.069345  0.0   0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  ...   
198219  0.000000  0.0   0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  ...   
56512   0.000000  0.0   0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  ...   
407528  0.000000  0.0   0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  ...   
304921  0.000000  0.0   0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  ...   
...          ...  ...   ...       ...  ...  ...  ...       ...  ...  ...  ...   
121232  0.000000  0.0   0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  ...   
223988  0.108511  0.0   0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  ...   
392839  0.021815  0.0   0.0  0.000000  0.0  0.0  0.0  0.194667  0.0  0.0  ...   
162831  0.000000  0.0   0.0  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  ...   
289006  0.076248  0.0   0.0  0.117514  0.0  0.0  0.0  0.000000  0.0  0.0  ...   

        yet       you  you 

In [29]:
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Assuming you already have X_train_no_id and y_train

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_no_id, y_train)
y_pred_logreg = logreg.predict(X_test_no_id)
print("Logistic Regression:")
print(classification_report(y_test, y_pred_logreg))

# Perceptron
perceptron = Perceptron()
perceptron.fit(X_train_no_id, y_train)
y_pred_perceptron = perceptron.predict(X_test_no_id)
print("Perceptron:")
print(classification_report(y_test, y_pred_perceptron))

# Ridge Classifier
ridge_classifier = RidgeClassifier()
ridge_classifier.fit(X_train_no_id, y_train)
y_pred_ridge = ridge_classifier.predict(X_test_no_id)
print("Ridge Classifier:")
print(classification_report(y_test, y_pred_ridge))

# Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_no_id, y_train)
y_pred_nb = nb.predict(X_test_no_id)
print("Multinomial Naive Bayes:")
print(classification_report(y_test, y_pred_nb))

# SGD Classifier
sgd_classifier = SGDClassifier()
sgd_classifier.fit(X_train_no_id, y_train)
y_pred_sgd = sgd_classifier.predict(X_test_no_id)
print("SGD Classifier:")
print(classification_report(y_test, y_pred_sgd))

# Linear SVC
linear_svc = LinearSVC()
linear_svc.fit(X_train_no_id, y_train)
y_pred_svc = linear_svc.predict(X_test_no_id)
print("Linear SVC:")
print(classification_report(y_test, y_pred_svc))




Logistic Regression:
              precision    recall  f1-score   support

           0       0.62      0.61      0.62     62033
           1       0.62      0.62      0.62     62032

    accuracy                           0.62    124065
   macro avg       0.62      0.62      0.62    124065
weighted avg       0.62      0.62      0.62    124065





Perceptron:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67     62033
           1       0.30      0.00      0.00     62032

    accuracy                           0.50    124065
   macro avg       0.40      0.50      0.33    124065
weighted avg       0.40      0.50      0.33    124065





Ridge Classifier:
              precision    recall  f1-score   support

           0       0.63      0.62      0.62     62033
           1       0.62      0.63      0.63     62032

    accuracy                           0.63    124065
   macro avg       0.63      0.63      0.63    124065
weighted avg       0.63      0.63      0.63    124065





ValueError: Negative values in data passed to MultinomialNB (input X)

In [None]:
# from sklearn.manifold import TSNE
# import pandas as pd

# tsne = TSNE(random_state=42)

# X_train_tsne = tsne.fit_transform(X_train_no_id)
# X_test_tsne = tsne.fit_transform(X_test_no_id)

# train_tsne_df = pd.DataFrame(X_train_tsne, columns=['tsne1', 'tsne2'])
# train_tsne_df['Class'] = y_train.values

# test_tsne_df = pd.DataFrame(X_test_tsne, columns=['tsne1', 'tsne2'])
# test_tsne_df['Class'] = y_test.values

# plt.figure(figsize=(12, 6))
# sns.scatterplot(x='tsne1', y='tsne2', hue='Class', data=train_tsne_df, palette='viridis')
# plt.title('t-SNE Visualization of Training Data')
# plt.show()

# plt.figure(figsize=(12, 6))
# sns.scatterplot(x='tsne1', y='tsne2', hue='Class', data=test_tsne_df, palette='viridis')
# plt.title('t-SNE Visualization of Test Data')
# plt.show()




KeyboardInterrupt: 