In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import os

In [9]:
# Load the CSV files
# Get the current working directory
parent_directory = os.path.dirname(os.getcwd())
# Construct the full path to the CSV files
test_data_path = os.path.join(parent_directory, 'data', 'processed', 'csv_files', 'full_data', 'test.csv')
train_data_path = os.path.join(parent_directory, 'data', 'processed', 'csv_files', 'full_data', 'train.csv')
val_data_path = os.path.join(parent_directory, 'data', 'processed',  'csv_files', 'full_data', 'validation.csv')

# Load the CSV files
test_data = pd.read_csv(test_data_path)
train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)

In [10]:
# print shapes
print('Test data shape: ', test_data.shape) 
print('Train data shape: ', train_data.shape)
print('Validation data shape: ', val_data.shape)

Test data shape:  (53585, 2)
Train data shape:  (428673, 2)
Validation data shape:  (53584, 2)


In [11]:
train_data.head()

Unnamed: 0,text,label
0,"[Your Name]\n[Your Address]\n[City, State, Zip...",1
1,The use of The Facial Action Coding System To ...,1
2,"Its 1983, the stickiness of your fingers still...",0
3,Should America really go out without the commo...,0
4,Within countries around the world people are c...,0


In [12]:
# Prepare the data for tf-idf
tfidf_vectorizer = TfidfVectorizer()

In [18]:
# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['text'])
y_train = train_data['label']

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(test_data['text'])
y_test = test_data['label']

X_val_tfidf = tfidf_vectorizer.transform(val_data['text'])
y_val = val_data['label']


In [20]:
# Train the classifier
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

# Predict the labels on test dataset
predictions = classifier.predict(X_test_tfidf)
predictions_val = classifier.predict(X_val_tfidf)

# Calculate accuracy
accuracy_test = accuracy_score(y_test, predictions)

accuracy_val = accuracy_score(y_val, predictions_val)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
print(accuracy_test)
print(accuracy_val)

0.9953531771951105
0.9949611824425202


In [22]:
sgd_model = SGDClassifier(
    max_iter     = 9000, 
    tol          = 1e-4, 
    random_state = 6743,
    loss         = "modified_huber") 
sgd_model.fit(X_train_tfidf, y_train)

# Predict the labels on test dataset
predictions = sgd_model.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
accuracy

0.992815153494448

In [24]:
# ALL CLASSIFIERS
sgd_model = SGDClassifier(
    max_iter     = 9000, 
    tol          = 1e-4, 
    random_state = 6743,
    loss         = "modified_huber") 

p={'verbose'          : -1,
    'n_iter'           : 3000,
    'colsample_bytree' : 0.7800,
    'colsample_bynode' : 0.8000, 
    'random_state'     : 6743,
    'metric'           : 'auc',
    'objective'        : 'cross_entropy',
    'learning_rate'    : 0.00581909898961407}
lgb=LGBMClassifier(**p)

cat = CatBoostClassifier(
    iterations        = 3000,
    verbose           = 0,
    subsample         = 0.35,
    random_seed       = 6543,
    allow_const_label = True,
    loss_function     = 'CrossEntropy',
    learning_rate     = 0.005599066836106983)
    

In [27]:
lgb.fit(X_train_tfidf, y_train)
predictions = lgb.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
accuracy



0.9970887375198283

In [28]:
lgb.fit(X_train_tfidf, y_train, num_threads=)

TypeError: LGBMClassifier.fit() got an unexpected keyword argument 'num_threads'

In [25]:
ensemble = VotingClassifier(
    estimators = [('mnb', clf),
                    ('sgd', sgd_model),
                    ('lgb', lgb), 
                    ('cat', cat)],
    weights    = [0.1, 0.31, 0.28, 0.67], 
    voting     = 'soft', 
    n_jobs     = -1
)

ensemble.fit(X_train_tfidf, y_train)

final_preds = ensemble.predict_proba(X_test_tfidf)



KeyboardInterrupt: 

In [4]:
from joblib import load
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from joblib import dump

dump(classifier, 'model.joblib')

# Assuming 'tfidf_vectorizer' is your fitted TfidfVectorizer
dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')


# Load the model from the file
classifier = load('model.joblib')

# Now, the 'classifier' variable holds your loaded model, 


In [10]:
classifier.predict_proba(X_test_tfidf)[:,0].tolist()

float

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the CSV files
final_test = pd.read_csv('/kaggle/input/augmented-data-for-llm-detect-ai-generated-text/final_test.csv')
final_train = pd.read_csv('/kaggle/input/augmented-data-for-llm-detect-ai-generated-text/final_train.csv')
test = pd.read_csv('/kaggle/input/seconddataset/test.csv')
train = pd.read_csv('/kaggle/input/seconddataset/train.csv')

# Rename columns for consistency
test.rename(columns={'generated': 'label'}, inplace=True)
train.rename(columns={'generated': 'label'}, inplace=True)

# Merge the test files and train files
merged_test = pd.concat([final_test, test])
merged_train = pd.concat([final_train, train])

# Prepare the data for tf-idf
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(merged_train['text'])
y_train = merged_train['label']

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(merged_test['text'])
y_test = merged_test['label']

# Train the classifier
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

# Predict the labels on test dataset
predictions = classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)

accuracy