In [28]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import TruncatedSVD

In [29]:
# Load data
df = pd.read_csv(r"C:\Users\gabri\Downloads\farm-ads (1).csv",header=None)

In [30]:
# Preview data
df.head()

Unnamed: 0,0,1
0,-1,ad-abdominal ad-aortic ad-aneurysm ad-doctorf...
1,-1,ad-abdominal ad-aortic ad-aneurysm ad-million...
2,-1,ad-absorbent ad-oil ad-snar ad-factory ad-dir...
3,-1,ad-acid ad-reflux ad-relief ad-top ad-treatme...
4,-1,ad-acid ad-reflux ad-symptom ad-acid ad-reflu...


In [31]:
print(df.columns)

Index([0, 1], dtype='int64')


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4143 entries, 0 to 4142
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       4143 non-null   int64 
 1   1       4143 non-null   object
dtypes: int64(1), object(1)
memory usage: 64.9+ KB


In [33]:
# Rename columns
df.columns = ['label', 'ad_text']

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4143 entries, 0 to 4142
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    4143 non-null   int64 
 1   ad_text  4143 non-null   object
dtypes: int64(1), object(1)
memory usage: 64.9+ KB


In [35]:
# Sample some relevant and non-relevant ads
relevant_ads = df[df['label'] == 1].sample(5)['ad_text'].tolist()
non_relevant_ads = df[df['label'] == -1].sample(5)['ad_text'].tolist()

print("\nSome relevant ads:")
for ad in relevant_ads:
    print(ad)

print("\nSome non-relevant ads:")
for ad in non_relevant_ads:
    print(ad)


Some relevant ads:
 ad-custom ad-polyclonal ad-quality ad-antiserum ad-rabbit ad-goat ad-sheep ad-host ad-llama title-capralogics capralogics capralogics inc po box hardwick ma usa phone fax info capralogics com capralogics virtual tour certify scrapie free goat dedicate polyclonal antibody service donor serum approve usda nih glp compliant art collection process facility rovide antibody service academic biotechnology pharmaceutical industry esearch antibody online store cit journal publication patent approve blood collection facility ec regulation quality reliable traceable polyclonal antibody service provide goat sheep llama rabbit lab service design peptide immunogen produce excellent polyclonal antibody capralogics verify history peer review scientific publication art collection process facility usda certify scrapie free goat produce volume serum plasma follow applicable regulatory requirement normal serum igg research antibody list www capralogics com anti brdu chemokine receptor

## Create Term-Document Matrix: We'll create a Term-Document matrix to represent the frequency of terms (words) in each document (ad text).

## 1. TDM (W/ no TF-idf) and CDM (LSA & LDA)

In [36]:
# Step 1: Create Term-Document matrix
vectorizer = CountVectorizer()
term_document_matrix = vectorizer.fit_transform(df['ad_text'])

In [37]:
# Step 2: Create Concept-Document matrix using LSA (TruncatedSVD)
lsa_model = TruncatedSVD(n_components=20)  # Limiting to 20 concepts
concept_document_matrix = lsa_model.fit_transform(term_document_matrix)

In [38]:
# Print shapes of matrices for verification
print("Shape of Term-Document matrix:", term_document_matrix.shape)
print("Shape of Concept-Document matrix:", concept_document_matrix.shape)

Shape of Term-Document matrix: (4143, 47513)
Shape of Concept-Document matrix: (4143, 20)


In [39]:
term_document_df = pd.DataFrame(term_document_matrix.toarray(), columns=vectorizer.get_feature_names_out())
term_document_df.head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaaaaaew,aaaaaaaaato,aaaaaaaagy,aaahc,aaai,aaalac,...,zx,zxiuyxnweb,zxwrjqag,zxzg,zy,zydeco,zyla,zymosine,zyrtec,zzay
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Convert Concept-Document matrix to DataFrame for easier printing
concept_document_df = pd.DataFrame(concept_document_matrix, columns=[f'Concept {i+1}' for i in range(20)])

# Print the Concept-Document matrix
concept_document_df.head()

Unnamed: 0,Concept 1,Concept 2,Concept 3,Concept 4,Concept 5,Concept 6,Concept 7,Concept 8,Concept 9,Concept 10,Concept 11,Concept 12,Concept 13,Concept 14,Concept 15,Concept 16,Concept 17,Concept 18,Concept 19,Concept 20
0,4.592638,3.533337,-4.369313,3.81535,-4.754536,-0.004135,6.846347,1.399808,-1.473429,3.311815,1.067547,-1.833333,-1.006285,0.173722,-0.756714,-0.594307,-0.520746,0.542974,-0.084053,-1.500622
1,3.2906,3.329551,-3.705518,1.987368,-3.17291,0.135381,4.654209,0.606289,-0.914956,1.938429,0.575785,-1.017605,-0.261704,-0.068814,-0.394853,-0.124096,-0.633402,-0.220881,-0.022034,-0.080982
2,4.645337,4.073115,-4.313991,2.291744,-3.655147,0.149851,5.059414,0.416972,-0.919762,2.070414,0.557132,-1.149115,-0.135317,-0.171444,-0.478388,-0.080317,-0.854557,-0.550514,-0.513365,0.425989
3,4.032476,5.411087,-5.398386,3.037195,-1.228585,-0.25967,4.775181,0.871792,-1.098991,1.603535,0.272577,0.232379,0.041754,-0.580075,-0.384801,-0.007515,-1.092206,-0.239242,0.138636,0.625492
4,4.894422,6.024332,-6.192926,4.899283,0.678415,-0.804616,5.170028,1.372935,-1.290546,1.858129,0.440375,-1.122238,-0.520435,-0.409348,-0.51851,0.153125,-1.217466,-0.253745,0.250036,-0.087368


In [41]:
n_components = 20
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
concept_matrix = lda.fit_transform(term_document_matrix)

cdm = pd.DataFrame(concept_matrix, columns=[f'Concept {i+1}' for i in range(n_components)])

In [42]:
cdm.head()

Unnamed: 0,Concept 1,Concept 2,Concept 3,Concept 4,Concept 5,Concept 6,Concept 7,Concept 8,Concept 9,Concept 10,Concept 11,Concept 12,Concept 13,Concept 14,Concept 15,Concept 16,Concept 17,Concept 18,Concept 19,Concept 20
0,0.001724,0.001724,0.001724,0.001724,0.310829,0.001724,0.001724,0.001724,0.001724,0.001724,0.001724,0.658136,0.001724,0.001724,0.001724,0.001724,0.001724,0.001724,0.001724,0.001724
1,0.001563,0.001563,0.001563,0.001563,0.288014,0.001563,0.001563,0.001563,0.001563,0.350134,0.001563,0.335289,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563
2,0.000481,0.113076,0.000481,0.000481,0.000481,0.067999,0.000481,0.000481,0.000481,0.000481,0.000481,0.776051,0.000481,0.000481,0.000481,0.000481,0.035182,0.000481,0.000481,0.000481
3,0.00122,0.00122,0.00122,0.00122,0.257259,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122,0.72079,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122
4,0.095539,0.085673,0.001111,0.001111,0.137151,0.001111,0.001111,0.001111,0.001111,0.001111,0.001111,0.66386,0.001111,0.001111,0.001111,0.001111,0.001111,0.001111,0.001111,0.001111


## 1.1 TDM( with Tf-idf)

In [43]:
tfidf_vectorizer = TfidfVectorizer()

tdm_tfidf = tfidf_vectorizer.fit_transform(df['ad_text'])

tdm_tfidf_df = pd.DataFrame(tdm_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [44]:
print("Shape of Term-Document matrix with TF-idf:", tdm_tfidf_df.shape)

Shape of Term-Document matrix with TF-idf: (4143, 47513)


In [45]:
tdm_tfidf_df.head(50)

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaaaaaew,aaaaaaaaato,aaaaaaaagy,aaahc,aaai,aaalac,...,zx,zxiuyxnweb,zxwrjqag,zxzg,zy,zydeco,zyla,zymosine,zyrtec,zzay
0,0.0,0.295645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.232515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
tdm_tfidf_df.to_csv(r"C:\Users\gabri\Downloads\tdm_tfidf.csv")

## 2. Logistic Regression

In [47]:
# Split the data into training and validation sets (75% training, 25% validation)
X_train, X_val, y_train, y_val = train_test_split(term_document_matrix, df['label'], test_size=0.25, random_state=42)

In [48]:
# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
# Predict labels for validation set
y_pred = model.predict(X_val)

In [50]:
# Create a DataFrame to display predicted and actual labels
results_df = pd.DataFrame({'Actual': y_val, 'Predicted': y_pred})

# Display the DataFrame
print(results_df)

      Actual  Predicted
2351       1          1
1966       1          1
1582      -1         -1
296       -1         -1
149       -1         -1
...      ...        ...
985       -1         -1
1138      -1         -1
4115       1         -1
1812      -1         -1
2780       1          1

[1036 rows x 2 columns]


In [51]:
# Evaluate model performance
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9256756756756757


In [52]:
# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred))


Classification Report:
              precision    recall  f1-score   support

          -1       0.92      0.93      0.92       504
           1       0.93      0.92      0.93       532

    accuracy                           0.93      1036
   macro avg       0.93      0.93      0.93      1036
weighted avg       0.93      0.93      0.93      1036

