In [2]:
import pandas as pd
from joblib import dump
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Load the two labelled sources: one read as the "Not Dark" examples, the
# other as the "Dark" examples.
df1 = pd.read_csv('/content/sample_data/normie.csv')
df2 = pd.read_csv('/content/sample_data/dark_patterns.csv')

# First dataset: keep rows that have a pattern string and are marked with
# classification 0, relabel them "Not Dark", and drop repeated strings.
# `assign` and `drop_duplicates` return new frames, so nothing is written
# into a filtered slice of the original (avoids SettingWithCopyWarning).
keep_benign = df1["Pattern String"].notna() & (df1["classification"] == 0)
df1 = (
    df1[keep_benign]
    .assign(classification="Not Dark")
    .drop_duplicates(subset="Pattern String")
)

# Second dataset: every row with a pattern string is labelled "Dark"; only
# the text and label columns are kept.
col = ["Pattern String", "classification"]
df2 = df2[df2["Pattern String"].notna()].assign(classification="Dark")[col]

# Stack both sources into the single frame used for training/evaluation.
df = pd.concat([df1, df2])

# Hold out part of the labelled corpus for evaluation.
# NOTE(review): train_size=0.25 fits on only a quarter of the rows and
# evaluates on the remaining three quarters -- confirm that test_size=0.25
# was not what was intended.
X_train, X_test, y_train, y_test = train_test_split(
    df["Pattern String"],
    df["classification"],
    train_size=0.25,
)

# Token counts -> tf-idf weighting -> random forest, chained so a single
# fit/predict call runs every stage in order.
pipeline_steps = [
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", RandomForestClassifier()),
]
text_clf = Pipeline(pipeline_steps)

# Fit on the training split, then label the held-out strings.
y_pred = text_clf.fit(X_train, y_train).predict(X_test)

# Fraction of held-out strings whose predicted label matches the true label.
# accuracy_score takes (y_true, y_pred); keeping that order matters as soon
# as this call is swapped for an asymmetric metric such as precision/recall.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Persist the fitted pipeline stages as separate artifacts.
# The forest was fitted on tf-idf weighted features, so the fitted 'tfidf'
# step is saved as well: at inference time the saved stages must be applied
# in order (vectorizer -> tf-idf -> classifier) to reproduce the inputs the
# classifier was trained on.
dump(text_clf.named_steps['clf'], 'presence_classifier_rf.joblib')
dump(text_clf.named_steps['tfidf'], 'presence_tfidf_rf.joblib')
dump(text_clf.named_steps['vect'], 'presence_vectorizer_rf.joblib')


Accuracy: 0.9467561521252796


['presence_vectorizer_rf.joblib']