In [22]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName('nlp').getOrCreate()

# 'u.item' is pipe-delimited and has no header row, so Spark auto-names the
# columns _c0.._c23. The MovieLens 100K files are ISO-8859-1 encoded; Spark's
# CSV reader assumes UTF-8 by default, which garbles accented characters in
# movie titles, so the encoding is given explicitly.
df = spark.read.csv('u.item', sep='|', encoding='ISO-8859-1')

# Quick look at the first few raw rows.
df.show(3)

+---+-----------------+-----------+----+--------------------+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|_c0|              _c1|        _c2| _c3|                 _c4|_c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|
+---+-----------------+-----------+----+--------------------+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|  1| Toy Story (1995)|01-Jan-1995|null|http://us.imdb.co...|  0|  0|  0|  1|  1|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|
|  2| GoldenEye (1995)|01-Jan-1995|null|http://us.imdb.co...|  0|  1|  1|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   1|   0|   0|
|  3|Four Rooms (1995)|01-Jan-1995|null|http://us.imdb.co...|  0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   1|   0|   0|
+---+-----------------+-----------+----+--------------------+---+---+---+---+---+----+--

In [23]:
# Print every column's name, type and nullability as a tree.
df.printSchema()


root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)
 |-- _c20: string (nullable = true)
 |-- _c21: string (nullable = true)
 |-- _c22: string (nullable = true)
 |-- _c23: string (nullable = true)



In [24]:
# Keep only the first five columns (_c0 .. _c4) and drop the rest.
df = df.select(*df.columns[:5])

In [25]:
# Give the first three auto-named columns readable names, one rename at a time.
for old_name, new_name in (('_c0', 'Nu'), ('_c1', 'Title'), ('_c2', 'Date')):
    df = df.withColumnRenamed(old_name, new_name)


In [26]:
# Preview the first three rows to check the renamed columns.
df.show(3)

+---+-----------------+-----------+----+--------------------+
| Nu|            Title|       Date| _c3|                 _c4|
+---+-----------------+-----------+----+--------------------+
|  1| Toy Story (1995)|01-Jan-1995|null|http://us.imdb.co...|
|  2| GoldenEye (1995)|01-Jan-1995|null|http://us.imdb.co...|
|  3|Four Rooms (1995)|01-Jan-1995|null|http://us.imdb.co...|
+---+-----------------+-----------+----+--------------------+
only showing top 3 rows



In [27]:
from pyspark.sql.functions import length
# Add a 'length' column holding the character count of each title.
df = df.withColumn(
    'length',
    length(col('Title')),
)

# Preview the result.
df.show(3)



+---+-----------------+-----------+----+--------------------+------+
| Nu|            Title|       Date| _c3|                 _c4|length|
+---+-----------------+-----------+----+--------------------+------+
|  1| Toy Story (1995)|01-Jan-1995|null|http://us.imdb.co...|    16|
|  2| GoldenEye (1995)|01-Jan-1995|null|http://us.imdb.co...|    16|
|  3|Four Rooms (1995)|01-Jan-1995|null|http://us.imdb.co...|    17|
+---+-----------------+-----------+----+--------------------+------+
only showing top 3 rows



In [28]:
# Whitespace tokenizer: lower-cases the title and splits it on whitespace.
tokenizer = Tokenizer(inputCol='Title', outputCol='words')

# Regex alternative that splits on non-word characters. It must read the
# 'Title' column as well: the DataFrame has no 'Text' column, so the previous
# inputCol='Text' would make regex_tokenizer.transform(df) fail.
regex_tokenizer = RegexTokenizer(inputCol='Title', outputCol='words', pattern='\\W')

# UDF returning the number of tokens in a 'words' array column.
count_tokens = udf(lambda words: len(words), IntegerType())

# Tokenize the titles with the whitespace tokenizer and preview the result.
tokenized = tokenizer.transform(df)
tokenized.show(5)


+---+-----------------+-----------+----+--------------------+------+--------------------+
| Nu|            Title|       Date| _c3|                 _c4|length|               words|
+---+-----------------+-----------+----+--------------------+------+--------------------+
|  1| Toy Story (1995)|01-Jan-1995|null|http://us.imdb.co...|    16|[toy, story, (1995)]|
|  2| GoldenEye (1995)|01-Jan-1995|null|http://us.imdb.co...|    16| [goldeneye, (1995)]|
|  3|Four Rooms (1995)|01-Jan-1995|null|http://us.imdb.co...|    17|[four, rooms, (19...|
|  4|Get Shorty (1995)|01-Jan-1995|null|http://us.imdb.co...|    17|[get, shorty, (19...|
|  5|   Copycat (1995)|01-Jan-1995|null|http://us.imdb.co...|    14|   [copycat, (1995)]|
+---+-----------------+-----------+----+--------------------+------+--------------------+
only showing top 5 rows



In [33]:
from pyspark.ml.feature import StopWordsRemover
# Stop-word filter: reads the 'words' token array and writes the surviving
# tokens to a new 'filtered' column.
remover = StopWordsRemover(
    inputCol='words',
    outputCol='filtered',
)

# Apply it to the tokenized titles and preview the first rows.
(
    remover
    .transform(tokenized)
    .show(3)
)

+---+-----------------+-----------+----+--------------------+------+--------------------+--------------------+
| Nu|            Title|       Date| _c3|                 _c4|length|               words|            filtered|
+---+-----------------+-----------+----+--------------------+------+--------------------+--------------------+
|  1| Toy Story (1995)|01-Jan-1995|null|http://us.imdb.co...|    16|[toy, story, (1995)]|[toy, story, (1995)]|
|  2| GoldenEye (1995)|01-Jan-1995|null|http://us.imdb.co...|    16| [goldeneye, (1995)]| [goldeneye, (1995)]|
|  3|Four Rooms (1995)|01-Jan-1995|null|http://us.imdb.co...|    17|[four, rooms, (19...|[four, rooms, (19...|
+---+-----------------+-----------+----+--------------------+------+--------------------+--------------------+
only showing top 3 rows

