In [18]:
def to_lowercase(text):
    return text.lower()

# Contoh penggunaan
sample_text = "We are from TI 22 T NPU"
print(to_lowercase(sample_text))


we are from ti 22 t npu


In [19]:
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Contoh penggunaan
sample_text = "Hi, how are you?"
print(remove_punctuation(sample_text))


Hi how are you


In [20]:
import re

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Contoh penggunaan
sample_text = "There are 13 students in TI 22 T."
print(remove_numbers(sample_text))


There are  students in TI  T.


In [1]:
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

def tokenize(text):
    return word_tokenize(text)

# Contoh penggunaan
sample_text = "We are from ti 22 t NPU."
print(tokenize(sample_text))


['We', 'are', 'from', 'ti', '22', 't', 'NPU', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

# Contoh penggunaan
sample_text = "Neva and Najla is a member of TI 22 T class."
tokenized_text = tokenize(sample_text)
print(remove_stopwords(tokenized_text))


['Neva', 'Najla', 'member', 'TI', '22', 'T', 'class', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
from nltk.stem import PorterStemmer

def stem_words(words):
    ps = PorterStemmer()
    return [ps.stem(word) for word in words]

# Contoh penggunaan
sample_text = "I like watching movie."
tokenized_text = tokenize(sample_text)
filtered_words = remove_stopwords(tokenized_text)
print(stem_words(filtered_words))


['i', 'like', 'watch', 'movi', '.']


In [2]:
sentence1 = "I love football"
sentence2 = "Messi is a great football player"
sentence3 = "Messi has won seven Ballon d’Or awards"

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
docs = [sentence1, sentence2, sentence3]
print(docs)

['I love football', 'Messi is a great football player', 'Messi has won seven Ballon d’Or awards']


In [4]:
#Mendefinisikan dan menyesuaikan count vectorizer pada dokumen.

vec = CountVectorizer()
X = vec.fit_transform(docs)
#Mengonversi vektor pada DataFrame menggunakan pandas

df = pd.DataFrame(X.toarray(),
    columns=vec.get_feature_names_out())
df.head()

Unnamed: 0,awards,ballon,football,great,has,is,love,messi,or,player,seven,won
0,0,0,1,0,0,0,1,0,0,0,0,0
1,0,0,1,1,0,1,0,1,0,1,0,0
2,1,1,0,0,1,0,0,1,1,0,1,1


In [5]:
import numpy as np
from collections import Counter
from math import log

# Tiga dokumen dalam korpus
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The lazy dog sleeps in the sun"
    ]

# Preprocessing: Lowercasing and tokenizing
tokenized_documents = [doc.lower().split() for doc in documents]

# Menghitung TF
def compute_tf(tokenized_doc):
    tf_dict = {}
    term_count = Counter(tokenized_doc)
    total_terms = len(tokenized_doc)
    for term, count in term_count.items():
        tf_dict[term] = count / total_terms
    return tf_dict

tf_list = [compute_tf(doc) for doc in tokenized_documents]

print("Term Frequency (TF):")
for idx, tf in enumerate(tf_list):
    print(f"Document {idx + 1} TF:")
    for term, score in tf.items():
        print(f"    {term}: {score:.4f}")


Term Frequency (TF):
Document 1 TF:
    the: 0.2222
    quick: 0.1111
    brown: 0.1111
    fox: 0.1111
    jumps: 0.1111
    over: 0.1111
    lazy: 0.1111
    dog: 0.1111
Document 2 TF:
    the: 0.2857
    lazy: 0.1429
    dog: 0.1429
    sleeps: 0.1429
    in: 0.1429
    sun: 0.1429


In [6]:
# Menghitung IDF
def compute_idf(tokenized_docs):
    idf_dict = {}
    total_docs = len(tokenized_docs)
    all_terms = set(term for doc in tokenized_docs for term in doc)
    for term in all_terms:
        doc_containing_term = sum(1 for doc in tokenized_docs if term in doc)
        idf_dict[term] = log(total_docs / (1 + doc_containing_term)) + 1
    return idf_dict

idf_dict = compute_idf(tokenized_documents)

print("\nInverse Document Frequency (IDF):")
for term, score in idf_dict.items():
    print(f"    {term}: {score:.4f}")



Inverse Document Frequency (IDF):
    jumps: 1.0000
    brown: 1.0000
    dog: 0.5945
    fox: 1.0000
    the: 0.5945
    quick: 1.0000
    in: 1.0000
    lazy: 0.5945
    sleeps: 1.0000
    sun: 1.0000
    over: 1.0000


In [7]:
# Menghitung TF-IDF
def compute_tfidf(tf_list, idf_dict):
    tfidf_list = []
    for tf in tf_list:
        tfidf_dict = {}
        for term, tf_value in tf.items():
            tfidf_dict[term] = tf_value * idf_dict.get(term, 0)
        tfidf_list.append(tfidf_dict)
    return tfidf_list

tfidf_list = compute_tfidf(tf_list, idf_dict)

print("\nTF-IDF:")
for idx, tfidf in enumerate(tfidf_list):
    print(f"Document {idx + 1} TF-IDF:")
    for term, score in tfidf.items():
        print(f"    {term}: {score:.4f}")


TF-IDF:
Document 1 TF-IDF:
    the: 0.1321
    quick: 0.1111
    brown: 0.1111
    fox: 0.1111
    jumps: 0.1111
    over: 0.1111
    lazy: 0.0661
    dog: 0.0661
Document 2 TF-IDF:
    the: 0.1699
    lazy: 0.0849
    dog: 0.0849
    sleeps: 0.1429
    in: 0.1429
    sun: 0.1429


In [8]:
from gensim.models import Word2Vec
import numpy as np

corpus = [
    'Ini adalah dokumen pertama.',
    'Dokumen kedua ini adalah contoh.',
    'Dan ini adalah dokumen ketiga.'
]

sentences = [doc.split() for doc in corpus]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def document_vector(doc):
    return np.mean([model.wv[word] for word in doc.split() if word in model.wv], axis=0)

doc_vectors = [document_vector(doc) for doc in corpus]
print(doc_vectors)


[array([-3.7941132e-03,  1.3971322e-03, -1.3302654e-03,  6.2948177e-03,
        7.5669785e-04, -9.2515646e-04,  2.1088016e-03,  3.0837939e-03,
       -5.7512512e-03,  7.2995876e-04,  2.0234017e-03, -3.2846206e-03,
        3.2237857e-03,  1.2359056e-03,  2.3005011e-03,  3.6598212e-04,
        1.4023911e-03, -2.5532098e-04, -4.4650156e-03, -6.0795713e-03,
        5.1451474e-04,  3.5236333e-03,  1.8079006e-03, -4.1199704e-03,
        6.4692814e-03, -2.5543815e-03, -6.6210038e-04,  2.7649710e-03,
       -3.8000084e-03, -3.2158426e-03,  3.2123891e-03, -2.4683301e-03,
        2.9981686e-03,  2.3309598e-03, -4.8931437e-03,  1.5661248e-03,
        1.9759887e-03,  2.2294575e-03,  1.7812600e-03, -2.5453365e-03,
       -3.2774555e-03,  4.1177375e-03, -3.2396559e-03, -8.4378332e-04,
        6.1413378e-04,  1.2738556e-03, -2.2212872e-03,  4.2844247e-03,
        1.2914615e-04,  3.0330098e-03, -1.0511086e-03,  2.5589571e-03,
       -3.1917188e-03, -9.2208735e-04,  1.8518131e-03, -5.1541473e-03,
     

In [9]:
!pip install numpy pandas scikit-learn




Klasifikasi teks dengan Machine Learning.
This dataset is a collection newsgroup documents. The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering.


In [10]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

# 1. Mengumpulkan data
newsgroups = fetch_20newsgroups(subset='all')

# 2. Preprocessing data
# Tidak perlu preprocessing khusus karena kita akan menggunakan TfidfVectorizer

# 3. Membagi data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.25, random_state=42)

# 4. Melatih model
# Membuat pipeline yang mencakup TfidfVectorizer dan MultinomialNB
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Melatih model menggunakan training set
model.fit(X_train, y_train)

# 5. Mengevaluasi model
# Prediksi pada testing set
y_pred = model.predict(X_test)

# Evaluasi kinerja model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names=newsgroups.target_names))

# Confusion Matrix
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, y_pred))


HTTPError: HTTP Error 403: Forbidden