In [None]:
import pandas as pd
from nltk import word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import re
from sklearn.svm import SVC

In [None]:
# Read the competition's labelled training set and unlabelled test set
# from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

In [216]:
# Combine text and keyword columns as feature for training data.
# Missing values are replaced with '' first so the string concatenation below
# never produces NaN. The result is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
train_data['text'] = train_data['text'].fillna('')
train_data['keyword'] = train_data['keyword'].fillna('')
train_data['key_text'] = train_data['text'] + ' ' + train_data['keyword']
X = train_data['key_text']
y = train_data['target']

In [217]:
# Remove URLs, hashtags and special characters from dataset
stemmer = PorterStemmer()
# Bound to its own name on purpose: rebinding `stopwords` would replace the
# imported NLTK corpus module with a set, and re-running this cell would then
# fail with AttributeError on `.words`.
stop_words = set(stopwords.words('english'))


def preprocess(data):
    """Clean and normalise an iterable of raw text strings.

    Each string is processed as follows:
      1. Substrings starting with ``http`` are removed up to the next whitespace.
      2. ``#word`` is replaced by ``word`` (the hashtag marker is dropped).
      3. The characters ``? ! @ # ; : & . ) = > - '`` are deleted.
      4. The remainder is tokenized with ``word_tokenize`` and lower-cased.
      5. English stop words are dropped and the remaining tokens are stemmed
         with the Porter stemmer.

    Parameters
    ----------
    data : iterable of str
        Raw texts. Every element must be a string; ``re.sub`` raises
        ``TypeError`` for non-string values such as NaN.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input element, in
        input order.
    """
    formatted = []
    for sentence in data:
        text = re.sub(r'http\S+', '', sentence)
        text = re.sub(r'#(\w+)', r'\1', text)
        # NOTE(review): ')' and '>' are stripped but '(' and '<' are not —
        # confirm whether that asymmetry is intended.
        text = re.sub(r'[?!@#;:&.)=>\-\']', '', text)
        # Lower-case before the stop-word lookup so capitalised stop words
        # ("The", "And") are filtered as well.
        tokens_lower = [token.lower() for token in word_tokenize(text)]
        tokens_stemmed = [
            stemmer.stem(token)
            for token in tokens_lower
            if token not in stop_words
        ]
        formatted.append(' '.join(tokens_stemmed))
    return formatted

In [218]:
# Vectorize the text data using CountVectorizer.
# The cleaned text is read from train_data['key_text'] and kept under its own
# name, so this cell can be re-run safely: the sparse matrix stored in X is
# never fed back into preprocess().
X_clean = preprocess(train_data['key_text'])
# binary=True records only whether a term occurs in a document, not how often.
cv = CountVectorizer(binary=True)
X = cv.fit_transform(X_clean)

In [219]:
# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [220]:
# Create a support-vector classifier with scikit-learn's default settings
# and fit it on the training split.
svm = SVC()
svm.fit(X=X_train, y=y_train)

In [221]:
# Predict labels for the held-out validation split and report the fraction
# that match the true labels.
y_pred = svm.predict(X_val)
score = accuracy_score(y_true=y_val, y_pred=y_pred)
print(score)

0.8130252100840336


In [222]:
# Combine text and keyword columns as feature for testing data.
# Missing values are replaced with '' first so the string concatenation below
# never produces NaN. The result is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
test_data['text'] = test_data['text'].fillna('')
test_data['keyword'] = test_data['keyword'].fillna('')
test_data['key_text'] = test_data['text'] + ' ' + test_data['keyword']
X_test = test_data['key_text']

In [223]:
# Load submission dataset
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

# Preprocess and predict target.
# The cleaned text is read from test_data['key_text'] and kept under its own
# name, so re-running this cell never passes the sparse matrix in X_test back
# into preprocess(). cv.transform reuses the vocabulary learned when cv was
# fitted; it is not refitted here.
X_test_clean = preprocess(test_data['key_text'])
X_test = cv.transform(X_test_clean)
# NOTE(review): predictions are assigned by position — this assumes the rows of
# sample_submission.csv are in the same order as test.csv; confirm.
submission['target'] = svm.predict(X_test)
submission.to_csv('submission.csv', index=False)

# Last expression of the cell so the preview is actually displayed.
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1
