<a href="https://www.kaggle.com/code/abdelazizsami/quora-insincere-questions-classification?scriptVersionId=189785507" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 1. Download libraries and data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Download data
train = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')
test = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')


# 2. Exploration and processing of data

In [2]:
# عرض بعض العينات من البيانات
print(train.head())

# تنظيف البيانات (يمكن تحسين هذا الجزء لاحقًا)
train['question_text'] = train['question_text'].str.replace('[^\w\s]', '').str.lower()

# تقسيم البيانات
X_train, X_val, y_train, y_val = train_test_split(train['question_text'], train['target'], test_size=0.2, random_state=42)


                    qid                                      question_text  \
0  00002165364db923c7e6  How did Quebec nationalists see their province...   
1  000032939017120e6e44  Do you have an adopted dog, how would you enco...   
2  0000412ca6e4628ce2cf  Why does velocity affect time? Does velocity a...   
3  000042bf85aa498cd78e  How did Otto von Guericke used the Magdeburg h...   
4  0000455dfa3e01eae3af  Can I convert montra helicon D to a mountain b...   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


# 3. Feature extraction using TF-IDF

In [3]:
# استخدام TF-IDF لاستخراج الميزات
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)


# 4. Build the model with increasing number of iterations

In [4]:
# زيادة عدد التكرارات
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


# 5. Model evaluation

In [5]:
# التنبؤ على بيانات التحقق
y_val_pred = model.predict(X_val_tfidf)

# حساب الدقة
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 0.9543994640635467


# 6. Processing and making predictions on test data

In [6]:
# معالجة بيانات الاختبار
test['question_text'] = test['question_text'].str.replace('[^\w\s]', '').str.lower()
X_test_tfidf = vectorizer.transform(test['question_text'])

# التنبؤ على بيانات الاختبار
test_predictions = model.predict(X_test_tfidf)

# إنشاء ملف للرفع على Kaggle
submission = pd.DataFrame({'qid': test['qid'], 'prediction': test_predictions})
submission.to_csv('submission.csv', index=False)


# Optimization using StandardScaler in Pipeline

In [7]:
# إنشاء أنبوب يحتوي على محول المقياس والنموذج
pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),  # with_mean=False لأن البيانات المصفوفة النادرة
    ('logreg', LogisticRegression(max_iter=1000))
])

pipeline.fit(X_train_tfidf, y_train)

# التنبؤ على بيانات التحقق باستخدام الأنبوب
y_val_pred = pipeline.predict(X_val_tfidf)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Accuracy with StandardScaler: {accuracy}')

# التنبؤ على بيانات الاختبار باستخدام الأنبوب
test_predictions = pipeline.predict(X_test_tfidf)
submission = pd.DataFrame({'qid': test['qid'], 'prediction': test_predictions})
submission.to_csv('submission_pipeline.csv', index=False)


Accuracy with StandardScaler: 0.9545449325294286
