### EAST AFRICA VIRTUAL HACKATHON 2022: SWAHILI SENTIMENT ANALYSIS CHALLENGE

## Let's Get Started 

In [None]:
# import important modules
import numpy as np
import pandas as pd

# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier #classifier

from sklearn.metrics import accuracy_score #evaluation metric
from sklearn.feature_extraction.text import CountVectorizer

# text preprocessing modules
import re 
from string import punctuation 

import warnings
# Suppress all Python warnings for the rest of the session so cell output
# stays short.
warnings.filterwarnings("ignore")
# seeding
# Fix the seed of NumPy's global random generator.
np.random.seed(123)

In [None]:
# Read the three competition CSV files. `path` is a plain string prefix that
# is concatenated in front of each file name (empty = current directory).
path = ''
train, test, submission = (
    pd.read_csv(path + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Preview the first five rows of the training data
train.head(n=5)

In [None]:
# Preview the first five rows of the test data
test.head(n=5)

In [None]:
# Preview the first five rows of the sample submission file
submission.head(n=5)

In [None]:
# check the shape of the train data: (number of rows, number of columns)
train.shape

In [None]:
# check the shape of the test data: (number of rows, number of columns)
test.shape

In [None]:
# Count the missing values in each column of the training data
train.isna().sum()

In [None]:
# Count the missing values in each column of the test data
test.isna().sum()

In [None]:
# Evaluate the label distribution: number of training rows per label value
train["Labels"].value_counts()

### Data Preparation 

In [None]:
# a simple function to clean text data 

def text_cleaning(text):
    """Normalise one tweet for bag-of-words vectorisation.

    Steps, in order:
      1. Every character outside ``A-Za-z0-9`` becomes a space. This already
         removes all punctuation, so no separate punctuation pass is needed.
      2. Standalone integers are dropped together with any whitespace that
         follows them, including a number at the very end of the text.
         Digits glued to letters (e.g. ``covid19``) are kept.
      3. The result is lower-cased.

    Args:
        text: Raw tweet text. Any non-string value (``None``, the float NaN
            pandas uses for a missing cell, ...) is treated as an empty tweet.

    Returns:
        The cleaned text as a single ``str`` (not a list of tokens).
        Whitespace is neither collapsed nor stripped.
    """
    # Missing cells reach this function as NaN/None; re.sub would raise
    # TypeError on them, so map them to an empty document instead.
    if not isinstance(text, str):
        return ""

    text = re.sub(r"[^A-Za-z0-9]", " ", text)

    # `\s*` rather than `\s+`: a number that ends the text has no trailing
    # whitespace and must still be removed. The closing `\b` keeps tokens
    # such as "12abc" intact.
    text = re.sub(r"\b\d+\b\s*", "", text)

    return text.lower()

In [None]:
# Run text_cleaning over the "Tweets" column of both frames and overwrite
# the column with the cleaned text.
train["Tweets"], test["Tweets"] = (
    df["Tweets"].apply(text_cleaning) for df in (train, test)
)

In [None]:
# Separate the target (label array) from the model input (cleaned tweet text)
y = train["Labels"].values
X = train["Tweets"]

In [None]:
# Bag-of-words encoding of the tweets.
# lowercase=False: the text was already lower-cased by text_cleaning.
# NOTE: the vocabulary is learned from every row of X, including the rows
# that are later held out as the validation split.
vectorizer = CountVectorizer(lowercase=False).fit(X)

# Encode the test tweets and the training tweets with the fitted vocabulary
test_transformed = vectorizer.transform(test["Tweets"])
X_transformed = vectorizer.transform(X)

In [None]:
# Hold out 10% of the rows for validation. The split is shuffled with a fixed
# seed and stratified on y.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_transformed, y, stratify=y, shuffle=True, test_size=0.10, random_state=42
)

### Create Classifier 

In [None]:
# Create a classifier
# Random forest with the library's default hyperparameters; no random_state
# is passed here.
# NOTE(review): run-to-run reproducibility presumably relies on the global
# np.random.seed call made earlier in the notebook — confirm.
tweets_classifier = RandomForestClassifier() 

In [None]:
# Fit the classifier on the training portion of the split
tweets_classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out validation rows; the predictions are scored
# against y_valid in the next cell.
y_preds = tweets_classifier.predict(X_valid)

In [None]:
# Evaluate the model on the validation data: accuracy of the predictions
# against the true validation labels
accuracy_score(y_true=y_valid, y_pred=y_preds)

In [None]:
# Create predictions for the vectorized test tweets
test_preds = tweets_classifier.predict(test_transformed)

### Create Submission File

In [None]:
# Write the test predictions into the "Labels" column of the submission frame.
# NOTE(review): the assignment is positional, so this assumes the rows of
# sample_submission.csv are in the same order as test.csv — confirm.
submission["Labels"] = test_preds

In [None]:
# Show the first five rows of the filled-in submission
submission.head(n=5)

In [None]:
# Save the submission as CSV without the index column
submission.to_csv(path_or_buf=path + "first_submission.csv", index=False)

Now upload your first submission file on the hackathon page 👍